Skip to content

Commit e2a06e8

Browse files
committed
ENH update 02_Habitat_mapping
1 parent e9c6b0b commit e2a06e8

File tree

5 files changed

+110
-158
lines changed

5 files changed

+110
-158
lines changed

General_Scripts/02_Habitat_mapping/01_map_habitat.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
'''
22
Concept:
33
Map habitat for all the smORFs from metaG.
4-
We split all the smORFs into 8 subfiles because of its large number.
4+
Split all the smORFs into 8 subfiles because of the large size.
55
Map habitat to raw data non-redundant cluster.
66
'''
77

@@ -27,23 +27,22 @@ def habitat(infile1,infile2,outpath):
2727
n = 0
2828
with open(infile1,'r',encoding = 'utf-8') as f1:
2929
for line in f1:
30-
line = line.strip()
30+
linelist = line.strip().split("\t")
3131
if line.startswith("sample"):
3232
continue
3333
else:
34-
linelist = line.split("\t")
3534
if len(linelist) > 20:
3635
if linelist[20] != "":
3736
micro_host[linelist[0]] = linelist[9]+" # "+linelist[20]
3837
else:
3938
micro_host[linelist[0]] = linelist[9]
39+
4040
with lzma.open(infile2,'rt') as f2:
4141
for line in f2:
42-
line = line.strip()
42+
linelist = line.strip().split("\t")
4343
if line.startswith("#GMSC"):
4444
continue
4545
else:
46-
linelist = line.split("\t")
4746
if n < 600000000:
4847
out1.write(linelist[0]+"\t"+micro_host[linelist[1]]+"\n")
4948
elif n >= 600000000 and n < 1200000000:
@@ -80,33 +79,31 @@ def map_cluster(infile1,infile2,outfile):
8079

8180
with lzma.open(infile1,"rt") as f1:
8281
for line in f1:
83-
line = line.strip()
84-
linelist = line.split("\t",1)
82+
linelist = line.strip().split("\t",1)
8583
if len(linelist) == 2:
8684
habitat[linelist[0]] = linelist[1]
8785
else:
8886
continue
8987

9088
with gzip.open(infile2,"rt") as f2:
9189
for line in f2:
92-
line = line.strip()
93-
linelist = line.split("\t")
90+
linelist = line.strip().split("\t")
9491
number = int(''.join(linelist[1].split(".")[2].split("_")))
9592
if number < 34617405: #The habitat of smORFs from Progenome is named isolate.
96-
out.write(linelist[0]+"\t"+linelist[1]+"\t"+"isolate"+"\n")
93+
out.write(f'{linelist[0]}\t{linelist[1]}\tisolate\n')
9794
else:
9895
if linelist[1] in habitat.keys():
99-
out.write(linelist[0]+"\t"+linelist[1]+"\t"+habitat[linelist[1]]+"\n")
96+
out.write(f'{linelist[0]}\t{linelist[1]}\t{habitat[linelist[1]]}\n')
10097
else:
101-
out.write(line+"\n")
98+
out.write(f'{line}')
10299
out.close()
103100

104101

105-
INPUT_FILE_1 = "./habitat/metadata.tsv"
102+
INPUT_FILE_1 = "metadata.tsv"
106103
INPUT_FILE_2 = "GMSC10.metag_smorfs.rename.txt.xz"
107104
INPUT_FILE_3 = "dedup_cluster.tsv.gz"
108-
OUT_PATH_1 = "./habitat/metag_habitat"
109-
OUT_PATH_2 = "./habitat/metag_cluster_habitat"
105+
OUT_PATH_1 = "metag_habitat"
106+
OUT_PATH_2 = "metag_cluster_habitat"
110107

111108
habitat(INPUT_FILE_1,INPUT_FILE_2,OUT_PATH_1)
112109

General_Scripts/02_Habitat_mapping/02_multi_habitat.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@ def multi_habitat(infile,outfile):
1515

1616
with lzma.open(infile,"rt") as f1:
1717
for line in f1:
18-
line = line.strip()
19-
cluster,metag,habitat = line.split("\t")
18+
cluster,metag,habitat = line.strip().split("\t")
2019
if cluster_dict:
2120
if cluster in cluster_dict:
2221
cluster_dict[cluster].append(metag)
@@ -25,7 +24,7 @@ def multi_habitat(infile,outfile):
2524
multihabitat = ",".join(sorted(list(habitat_set)))
2625
for key,value in cluster_dict.items():
2726
for smorf in value:
28-
out.write(key+"\t"+smorf+"\t"+multihabitat+"\n")
27+
out.write(f'{key}\t{smorf}\t{multihabitat}\n')
2928
cluster_dict = {}
3029
habitat_set = set()
3130
cluster_dict[cluster] = [metag]
@@ -37,60 +36,57 @@ def multi_habitat(infile,outfile):
3736

3837
def extract(infile1,infile2,outfile):
3938
import lzma
39+
import gzip
4040
smorf = set()
4141
out = lzma.open(outfile, "wt")
4242

43-
with lzma.open(infile1,"rt") as f1:
43+
with gzip.open(infile1,"rt") as f1:
4444
for line in f1:
45-
line = line.strip()
46-
linelist = line.split("\t")
47-
smorf.add(linelist[0])
45+
member,cluster = line.strip().split("\t")
46+
smorf.add(member)
4847

4948
with lzma.open(infile2,"rt") as f2:
5049
for line in f2:
51-
line = line.strip()
52-
linelist = line.split("\t")
53-
if linelist[1] in smorf:
54-
out.write(linelist[1]+"\t"+linelist[2]+"\n")
50+
seq,habitat = line.strip().split("\t")
51+
if seq in smorf:
52+
out.write(f'{seq}\t{habitat}\n')
5553
else:
5654
continue
57-
5855
out.close()
5956

6057
def general(infile1,infile2,outfile):
6158
import lzma
6259
out = lzma.open(outfile,"wt")
6360
env = {}
61+
6462
with open(infile1,"rt") as f1:
6563
for line in f1:
66-
line = line.strip()
67-
sample,amp,microontology,name,host,habitat = line.split("\t")
64+
sample,amp,microontology,name,host,habitat = line.strip().split("\t")
6865
if host != "":
6966
fullhabitat = microontology + " # " + host
7067
else:
7168
fullhabitat = microontology
7269
env[fullhabitat] = habitat
7370
env["isolate"] = "isolate"
71+
7472
with lzma.open(infile2,"rt") as f2:
7573
for line in f2:
76-
line = line.strip()
77-
smorf,multihabitat = line.split("\t")
74+
smorf,multihabitat = line.strip().split("\t")
7875
multilist = multihabitat.split(",")
7976
change = set()
8077
for i in multilist:
8178
i = i.replace("-"," ")
8279
change.add(env[i])
8380
generalhabitat = ",".join(sorted(list(change)))
84-
out.write(smorf+"\t"+generalhabitat+"\n")
85-
81+
out.write(f'{smorf}\t{generalhabitat}\n')
8682
out.close()
8783

88-
INPUT_FILE_1 = "./habitat/metag_cluster_habitat.tsv.xz"
89-
INPUT_FILE_2 = "./frozen/100AA_rename.tsv.xz"
90-
INPUT_FILE_3 = "./habitat/habitat_general.txt"
91-
OUTPUT_FILE_1 = "./habitat/all_cluster_multi_habitat.tsv.xz"
92-
OUTPUT_FILE_2 = "./habitat/id100/100AA_multi_habitat.tsv.xz"
93-
OUTPUT_FILE_3 = "./habitat/id100/100AA_multi_general_habitat.tsv.xz"
84+
INPUT_FILE_1 = "metag_cluster_habitat.tsv.xz"
85+
INPUT_FILE_2 = "GMSC.cluster.tsv.gz"
86+
INPUT_FILE_3 = "habitat_general.txt"
87+
OUTPUT_FILE_1 = "all_cluster_multi_habitat.tsv.xz"
88+
OUTPUT_FILE_2 = "100AA_multi_habitat.tsv.xz"
89+
OUTPUT_FILE_3 = "100AA_multi_general_habitat.tsv.xz"
9490

9591
multi_habitat(INPUT_FILE_1,OUTPUT_FILE_1)
9692
extract(INPUT_FILE_2,OUTPUT_FILE_1,OUTPUT_FILE_2)
Lines changed: 76 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,96 @@
11
'''
22
Concept:
33
Map habitat to 90% identity clusters.
4+
Combine multiple habitats for each smORF from the same cluster.
5+
Change habitats to general name.
46
'''
57

68
import gzip
79
import lzma
810

9-
'''
10-
Change format of 90% identity clusters including two columns(90AA clusters and 100AA name)
11-
'''
12-
def change_format_90(infile1,outfile):
13-
cluster = {}
11+
def mapcluster(infile1,infile2,outfile):
12+
habitat = {}
1413
out = lzma.open(outfile, "wt")
15-
with gzip.open(infile1,"rt") as f1:
14+
with lzma.open(infile1,"rt") as f1:
15+
for line in f1:
16+
smorf,habitat = line.strip().split("\t",1)
17+
habitat[smorf] = habitat
18+
19+
with gzip.open(infile2,"rt") as f2:
20+
for line in f2:
21+
member,cluster = line.strip().split("\t")
22+
out.write(f'{cluster}\t{member}\t{habitat[member]}\n')
23+
out.close()
24+
25+
def multi_habitat(infile,outfile):
26+
import lzma
27+
28+
cluster_dict = {}
29+
habitat_set = set()
30+
out = lzma.open(outfile, "wt")
31+
32+
with lzma.open(infile,"rt") as f1:
1633
for line in f1:
17-
line = line.strip()
18-
linelist = line.split("\t")
19-
if len(linelist) == 3:
20-
if linelist[2] in cluster.keys():
21-
cluster[linelist[2]].append(linelist[0])
34+
cluster,metag,habitat = line.strip().split("\t")
35+
if cluster_dict:
36+
if cluster in cluster_dict:
37+
cluster_dict[cluster].append(metag)
38+
habitatlist = habitat.split(",")
39+
for h in habitatlist:
40+
habitat_set.add(h)
2241
else:
23-
cluster[linelist[2]] = [linelist[0]]
42+
multihabitat = ",".join(sorted(list(habitat_set)))
43+
for key,value in cluster_dict.items():
44+
for smorf in value:
45+
out.write(f'{key}\t{smorf}\t{multihabitat}\n')
46+
cluster_dict = {}
47+
habitat_set = set()
48+
cluster_dict[cluster] = [metag]
49+
habitatlist = habitat.split(",")
50+
for h in habitatlist:
51+
habitat_set.add(h)
2452
else:
25-
cluster[linelist[0]] = [linelist[0]]
26-
for key,value in cluster.items():
27-
for i in range(len(value)):
28-
out.write(key+"\t"+value[i]+"\n")
29-
out.close()
53+
cluster_dict[cluster] = [metag]
54+
habitatlist = habitat.split(",")
55+
for h in habitatlist:
56+
habitat_set.add(h)
57+
out.close()
3058

31-
'''
32-
Map habitat to 90% identity clusters.
33-
'''
34-
def mapcluster(infile1,infile2,outfile):
35-
habitat = {}
36-
out = lzma.open(outfile, "wt")
37-
with lzma.open(infile1,"rt") as f1:
59+
def general(infile1,infile2,outfile):
60+
import lzma
61+
out = lzma.open(outfile,"wt")
62+
env = {}
63+
64+
with open(infile1,"rt") as f1:
3865
for line in f1:
39-
line = line.strip()
40-
linelist = line.split("\t",1)
41-
habitat[linelist[0]] = linelist[1]
66+
sample,amp,microontology,name,host,habitat = line.strip().split("\t")
67+
if host != "":
68+
fullhabitat = microontology + " # " + host
69+
else:
70+
fullhabitat = microontology
71+
env[fullhabitat] = habitat
72+
env["isolate"] = "isolate"
4273

4374
with lzma.open(infile2,"rt") as f2:
4475
for line in f2:
45-
line = line.strip()
46-
linelist = line.split("\t")
47-
out.write(linelist[0]+"\t"+linelist[1]+"\t"+habitat[linelist[1]]+"\n")
48-
76+
smorf_cluster,smorf,multihabitat = line.strip().split("\t")
77+
multilist = multihabitat.split(",")
78+
change = set()
79+
for i in multilist:
80+
i = i.replace("-"," ")
81+
change.add(env[i])
82+
generalhabitat = ",".join(sorted(list(change)))
83+
out.write(f'{smorf_cluster}\t{smorf}\t{generalhabitat}\n')
84+
4985
out.close()
5086

51-
INPUT_FILE_1 = "./clust_result/result/all_0.5_0.9.tsv.gz"
52-
INPUT_FILE_2 = "./habitat/id100/100AA_multi_general_habitat.tsv.xz"
53-
OUTPUT_FILE_1 = "all_cluster_0.9.tsv.xz"
54-
OUTPUT_FILE_2 = "./habitat/id90/cluster_multi_habitat_90.tsv.xz"
87+
INPUT_FILE_1 = "100AA_multi_general_habitat.tsv.xz"
88+
INPUT_FILE_2 = "GMSC.cluster.tsv.gz"
89+
INPUT_FILE_3 = "habitat_general.txt"
90+
OUTPUT_FILE_1 = "cluster_multi_habitat_90.tsv.xz"
91+
OUTPUT_FILE_2 = "90AA_multi_habitat.tsv.xz"
92+
OUTPUT_FILE_3 = "90AA_multi_general_habitat.tsv.xz"
5593

56-
change_format_90(INPUT_FILE_1,OUTPUT_FILE_1)
57-
mapcluster(INPUT_FILE_2,OUTPUT_FILE_1,OUTPUT_FILE_2)
94+
mapcluster(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
95+
multi_habitat(OUTPUT_FILE_1,OUTPUT_FILE_2)
96+
general(INPUT_FILE_3,OUTPUT_FILE_2,OUTPUT_FILE_3)

General_Scripts/02_Habitat_mapping/04_multi_habitat_90_50.py

Lines changed: 0 additions & 79 deletions
This file was deleted.

0 commit comments

Comments
 (0)