Skip to content

Commit 164a458

Browse files
committed
ENH update 08_Conserved_domain_annotation
1 parent c73ddc3 commit 164a458

File tree

16 files changed

+188
-189
lines changed

16 files changed

+188
-189
lines changed

General_Scripts/02_Habitat_mapping/02_multi_habitat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def general(infile1,infile2,outfile):
8686
INPUT_FILE_3 = "habitat_general.txt"
8787
OUTPUT_FILE_1 = "all_cluster_multi_habitat.tsv.xz"
8888
OUTPUT_FILE_2 = "100AA_multi_habitat.tsv.xz"
89-
OUTPUT_FILE_3 = "100AA_multi_general_habitat.tsv.xz"
89+
OUTPUT_FILE_3 = "GMSC10.100AA.general_habitat.tsv.xz"
9090

9191
multi_habitat(INPUT_FILE_1,OUTPUT_FILE_1)
9292
extract(INPUT_FILE_2,OUTPUT_FILE_1,OUTPUT_FILE_2)

General_Scripts/02_Habitat_mapping/03_map_cluster_habitat.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,12 @@ def general(infile1,infile2,outfile):
8484

8585
out.close()
8686

87-
INPUT_FILE_1 = "100AA_multi_general_habitat.tsv.xz"
87+
INPUT_FILE_1 = "GMSC10.100AA.general_habitat.tsv.xz"
8888
INPUT_FILE_2 = "GMSC.cluster.tsv.gz"
8989
INPUT_FILE_3 = "habitat_general.txt"
9090
OUTPUT_FILE_1 = "cluster_multi_habitat_90.tsv.xz"
9191
OUTPUT_FILE_2 = "90AA_multi_habitat.tsv.xz"
92-
OUTPUT_FILE_3 = "90AA_multi_general_habitat.tsv.xz"
92+
OUTPUT_FILE_3 = "GMSC10.90AA.general_habitat.tsv.xz"
9393

9494
mapcluster(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
9595
multi_habitat(OUTPUT_FILE_1,OUTPUT_FILE_2)

General_Scripts/02_Habitat_mapping/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
| **Code** | **Description** | **Input** | **Output** |
44
| :---: | :---: | :---: | :---: |
55
| 01_map_habitat.py | Map habitat for all the smORFs from metaG | metadata.tsv GMSC10.metag_smorfs.rename.txt.xz dedup_cluster.tsv.gz| metag_cluster_habitat.tsv.xz |
6-
| 02_multi_habitat.py | Combine multiple habitats for each smORF from the same cluster | metag_cluster_habitat.tsv.xz GMSC.cluster.tsv.gz habitat_general.txt| 100AA_multi_general_habitat.tsv.xz |
7-
| 03_map_cluster_habitat.py | Map multiple habitats to 90% identity smORFs clusters. | GMSC.cluster.tsv.gz 100AA_multi_general_habitat.tsv.xz habitat_general.txt| 90AA_multi_general_habitat.tsv.xz |
6+
| 02_multi_habitat.py | Combine multiple habitats for each smORF from the same cluster | metag_cluster_habitat.tsv.xz GMSC.cluster.tsv.gz habitat_general.txt| GMSC10.100AA.general_habitat.tsv.xz |
7+
| 03_map_cluster_habitat.py | Map multiple habitats to 90% identity smORFs clusters. | GMSC.cluster.tsv.gz 100AA_multi_general_habitat.tsv.xz habitat_general.txt| GMSC10.90AA.general_habitat.tsv.xz |

General_Scripts/08_Conserved_domain_annotation/01_Annotation/01_cdd.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
# Concept:
44
# Map to CDD database
55

6-
rpsblast -query 90AA_GMSC.faa -out 90AA_cdd.tsv -db ~/Cdd -num_threads 20 -evalue 0.01 -outfmt "6 qseqid sseqid qlen score length pident evalue"
6+
rpsblast -query 90AA_GMSC.faa -out 90AA_cdd.tsv -db ./Cdd -num_threads 20 -evalue 0.01 -outfmt "6 qseqid sseqid qlen score length pident evalue"

General_Scripts/08_Conserved_domain_annotation/01_Annotation/02_add_pssm_length.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
'''
2+
Concept:
23
Add length of PSSM and filter with target coverage >80%.
34
'''
45
def add_length(infile1,infile2,outfile):
@@ -23,6 +24,7 @@ def add_length(infile1,infile2,outfile):
2324
out.write(f'{line}\t{cdd_dict[pssm]}\n')
2425

2526
def filter_cov(infile,outfile):
27+
import pandas as pd
2628
result = pd.read_csv(infile,compression='gzip',sep='\t',header=None,names=['smorf','cdd','query_length','score','align_length','identity','evalue','target_length'])
2729
result['tcov'] = result['align_length']/result['target_length']
2830
result = result[result['tcov'] >0.8]
Lines changed: 4 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
'''
2-
Analyse if smORFs 90AA families are specific or multiple at the taxonomy rank.
2+
Concept:
3+
Analyse if 90AA families are specific or multiple at the taxonomy rank.
34
'''
45

56
import pandas as pd
@@ -51,26 +52,6 @@ def cal(infile,outfile):
5152
out.write(f'{smorf}\t{flag_multi}\t{flag_specific}\t{number}\n')
5253
out.close()
5354

54-
# change to new identifier
55-
56-
def store(infile1):
57-
import lzma
58-
name = {}
59-
with lzma.open(infile1,'rt') as f1:
60-
for line in f1:
61-
old,new = line.strip().split('\t')
62-
name[old] = new
63-
return name
64-
65-
def map_multi(name,infile,outfile):
66-
out1 = open(outfile,'wt')
67-
with open(infile,'rt') as f2:
68-
for line in f2:
69-
family,anno = line.strip().split('\t',1)
70-
if family in name.keys():
71-
out1.write(f'{name[family]}\t{anno}\n')
72-
out1.close()
73-
7455
# Calculate number of taxonomy specific
7556
def merge(infile,outfile):
7657
km = 0
@@ -158,12 +139,8 @@ def merge(infile,outfile):
158139
out.write(f'kingdom-specific\t{ok}\t{pm}\t{ps}\n')
159140

160141
infile1 = 'metag_cluster_tax_90.tsv'
161-
infile2 = '90AA_rename.tsv.xz'
162142
outfile1 = '90AA_taxa_multi_specific.tsv'
163-
outfile2 = '90AA_multi_newname.tsv'
164-
outfile3 = '90AA_specific_multi.tsv'
143+
outfile2 = '90AA_specific_multi.tsv'
165144

166145
cal(infile1,outfile1)
167-
name = store(infile2)
168-
map_multi(name,outfile1,outfile2)
169-
merge(outfile1,outfile3)
146+
merge(outfile1,outfile2)
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
'''
2+
Concept:
23
Select 90AA smORF families across all 8 habitat categories.
34
Add cdd annotation to these 90AA smORF families.
45
'''
@@ -113,7 +114,7 @@ def merge_habitat_motif(infile1,infile2,infile3,outfile):
113114
infile2 = '1_cdd_tcov_90AA.tsv.gz'
114115
infile3 = 'cddid_all.tbl.gz'
115116
outfile1 = 'all_habitat_smorf.tsv'
116-
outfile2 = 'all_habitat_motif_right.tsv'
117+
outfile2 = 'all_habitat_smorf_motif.tsv'
117118

118119
map_high(infile1,outfile1)
119120
merge_habitat_motif(infile2,outfile1,infile3,outfile2)
Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
'''
2+
Concept:
23
Calculate the species number in 90AA families.
34
'''
45
def tax_format(infile,outfile):
@@ -27,42 +28,26 @@ def cal_species(infile,outfile):
2728
for key,value in species_number.items():
2829
out.write(f'{key}\t{len(value)}\n')
2930

30-
def store(infile):
31-
import lzma
32-
name = {}
33-
with lzma.open(infile,'rt') as f:
34-
for line in f:
35-
old,new = line.strip().split('\t')
36-
name[old] = new
37-
return name
38-
39-
def select(infile2):
31+
def count(infile1,infile2,outfile):
4032
housekeeping = set()
41-
with open(infile2,'rt') as f2:
42-
for line in f2:
33+
with open(infile1,'rt') as f:
34+
for line in f:
4335
smorf,habitat = line.strip().split('\t')
4436
housekeeping.add(smorf)
45-
return housekeeping
4637

47-
def count(name,housekeeping,infile,outfile):
48-
out = open(outfile,'wt')
49-
with open(infile,'rt') as f1:
50-
for line in f1:
51-
smorf,number = line.strip().split('\t')
52-
if smorf in name.keys():
53-
if name[smorf] in housekeeping:
54-
out.write(f'{name[smorf]}\t{number}\n')
55-
out.close()
38+
with open(outfile,'wt') as out:
39+
with open(infile2,'rt') as f:
40+
for line in f:
41+
smorf,number = line.strip().split('\t')
42+
if smorf in housekeeping:
43+
out.write(f'{smorf}\t{number}\n')
5644

5745
infile1 = "metag_cluster_tax_90.tsv.xz"
58-
infile2 = '90AA_rename.tsv.xz'
59-
infile3 = 'all_habitat_smorf.tsv'
46+
infile2 = 'all_habitat_smorf.tsv'
6047
outfile1 = "metag_cluster_tax_90.tsv"
6148
outfile2 = '90AA_species_number.tsv'
6249
outfile3 = 'housekeeping_species.tsv'
6350

6451
tax_format(infile1,outfile1)
6552
cal_species(outfile1,outfile2)
66-
name = store(infile2)
67-
housekeeping = select(infile3)
68-
count(name,housekeeping,outfile2,outfile3)
53+
count(infile2,outfile2,outfile3)

General_Scripts/08_Conserved_domain_annotation/02_multi-phylum_analysis/04_merge_multi-phylum.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def merge():
3131
df.to_csv('housekeeping_motif_species_multi_phylum_all.tsv',sep='\t',index=None)
3232

3333
infile1 = 'housekeeping_species.tsv'
34-
infile2 = '90AA_multi_newname.tsv'
34+
infile2 = '90AA_taxa_multi_specific.tsv'
3535
outfile = 'housekeeping_multi.tsv'
3636

3737
map_multi(infile1,infile2,outfile)

General_Scripts/08_Conserved_domain_annotation/02_multi-phylum_analysis/05_map_sample_taxonomy.py

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@ def map_90(sample_100,infile2,outfile):
1919
sample_90 = {}
2020
with lzma.open(infile2,'rt') as f2:
2121
for line in f2:
22-
linelist = line.strip().split('\t')
23-
if linelist[1] != '':
24-
if linelist[0] in sample_100.keys():
25-
if linelist[1] not in sample_90.keys():
26-
sample_90[linelist[1]] = []
27-
sample_90[linelist[1]].append(sample_100[linelist[0]])
22+
member,cluster = line.strip().split('\t')
23+
if member in sample_100.keys():
24+
if cluster not in sample_90.keys():
25+
sample_90[cluster] = []
26+
sample_90[cluster].append(sample_100[member])
2827
sample_100 = {}
2928
with open(outfile,'wt') as out:
3029
for key,value in sample_90.items():
@@ -35,47 +34,44 @@ def map_sample(infile1,infile2,outfile):
3534
new = set()
3635
with open(infile1,'rt') as f1:
3736
for line in f1:
38-
new.add(line.strip())
37+
smorf,number = line.strip().split('\t')
38+
if number > 100:
39+
new.add(smorf)
3940

4041
with open(outfile,'wt') as out:
4142
with open(infile2,'rt') as f2:
4243
for line in f2:
43-
linelist = line.strip().split('\t')
44-
if linelist[0] in new:
44+
cluster,sample = line.strip().split('\t')
45+
if cluster in new:
4546
out.write(line)
4647

47-
def map_taxonomy(infile1,infile2,infile3,outfile):
48+
def map_taxonomy(infile1,infile2,outfile):
4849
import lzma
49-
new = set()
50-
old = {}
51-
with open(infile1,'rt') as f1:
52-
for line in f1:
53-
new.add(line.strip())
50+
seqs = set()
5451

55-
with lzma.open(infile2,'rt') as f2:
56-
for line in f2:
57-
linelist = line.strip().split('\t')
58-
if linelist[1] in new:
59-
old[linelist[0]] = linelist[1]
52+
with open(infile1,'rt') as f:
53+
for line in f:
54+
smorf,number = line.strip().split('\t')
55+
if number > 100:
56+
seqs.add(smorf)
6057

6158
with open(outfile,'wt') as out:
62-
with lzma.open(infile3,'rt') as f3:
63-
for line in f3:
64-
line = line.strip()
65-
linelist = line.split('\t',2)
66-
if linelist[0] in old:
67-
out.write(f'{old[linelist[0]]}\t{line}\n')
59+
with lzma.open(infile2,'rt') as f:
60+
for line in f:
61+
cluster,member,taxonomy = line.strip().split('\t',2)
62+
if cluster in seqs:
63+
out.write(line)
6864

6965
infile1 = '100AA_sample.tsv.xz'
70-
infile2 = 'all_0.9_0.5_family_sort.tsv.xz'
71-
infile3 = 'housekeeping.txt'
72-
infile4 = '90AA_rename.tsv.xz'
73-
infile5 = 'metag_cluster_tax_90.tsv.xz'
66+
infile2 = 'GMSC.cluster.tsv.gz'
67+
infile3 = 'housekeeping_species.tsv'
68+
infile4 = 'metag_cluster_tax_90.tsv.xz'
69+
7470
outfile1 = '90AA_sample.tsv'
7571
outfile2 = 'housekeeping_sample.txt'
7672
outfile3 = 'housekeeping_taxonomy.txt'
7773

7874
sample_100 = store_100(infile1)
7975
map_90(sample_100,infile2,outfile1)
8076
map_sample(infile3,outfile1,outfile2)
81-
map_taxonomy(infile3,infile4,infile5,outfile3)
77+
map_taxonomy(infile3,infile4,outfile3)

0 commit comments

Comments
 (0)