Skip to content

Commit e9c6b0b

Browse files
committed
ENH update 01_Taxonomy_mapping
1 parent 34189b1 commit e9c6b0b

File tree

9 files changed

+167
-152
lines changed

9 files changed

+167
-152
lines changed

General_Scripts/01_Taxonomy_mapping/01_map_prog_taxa_dedup.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
'''
22
Concept:
3-
Map taxonomy of smORFs from Progenomes.
43
De duplicate of smORFs from Progenomes.
4+
Map taxonomy of smORFs from Progenomes.
55
'''
66

77
import pandas as pd
@@ -42,12 +42,12 @@ def dedup_fasta(infile,outfile1,outfile2):
4242
out2.close()
4343
print("finish dedup and sort")
4444

45-
prog = "./taxa/progenome/genome_prog.tsv"
46-
taxa = "./taxa/progenome/specI_genome_taxa.txt"
47-
prog_taxa = "./taxa/progenome/prog_specI_genome_taxa.tsv"
45+
prog = "genome_prog.tsv"
46+
taxa = "specI_genome_taxa.txt"
47+
prog_taxa = "prog_specI_genome_taxa.tsv"
4848
maptaxa_tsv(prog,taxa,prog_taxa)
4949

5050
INPUT_FILE = "GMSC10.ProG_smorfs.faa.gz"
51-
OUTPUT_FILE_1 = "./taxa/progenome/prog_dedup_sort.faa.gz"
52-
OUTPUT_FILE_2 = "./taxa/progenome/prog_redundant.tsv.gz"
51+
OUTPUT_FILE_1 = "prog_dedup_sort.faa.gz"
52+
OUTPUT_FILE_2 = "prog_redundant.tsv.gz"
5353
dedup_fasta(INPUT_FILE,OUTPUT_FILE_1,OUTPUT_FILE_2)

General_Scripts/01_Taxonomy_mapping/03_lca_change_format.py

Lines changed: 38 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -8,34 +8,32 @@ def mergeall(infile1,infile2,outfile):
88
name = {}
99
with gzip.open(infile1,"rt") as f1:
1010
for line in f1:
11-
line = line.strip()
12-
linelist = line.split("\t")
11+
linelist = line.strip().split("\t")
1312
if linelist[0] in name.keys():
1413
name[linelist[0]].append(linelist[1])
1514
else:
1615
name[linelist[0]] = [linelist[1]]
1716

18-
out1 = gzip.open(outfile, "wt", compresslevel=1)
19-
17+
out1 = gzip.open(outfile, "wt", compresslevel=1)
2018
with open(infile2,"rt") as f2:
2119
for line in f2:
22-
line = line.strip()
23-
linelist = line.split("\t")
24-
for i in range (len(name[linelist[1]])):
25-
out1.write(linelist[0]+"\t"+name[linelist[1]][i]+"\n")
20+
linelist = line.strip().split("\t")
21+
for item in name[linelist[1]]:
22+
out1.write(f'{linelist[0]}\t{item}\n')
2623
out1.close()
2724

2825
def LCA(infile1,infile2,outfile):
2926
import gzip
27+
3028
name = {}
3129
cluster = {}
3230
change = {}
3331
taxa = set()
3432
flag = 1
33+
3534
with gzip.open(infile1,"rt") as f1:
3635
for line in f1:
37-
line = line.strip()
38-
linelist = line.split("\t")
36+
linelist = line.strip().split("\t")
3937
if len(linelist) == 11:
4038
kindom = linelist[4]
4139
phylum = linelist[5]
@@ -46,15 +44,13 @@ def LCA(infile1,infile2,outfile):
4644
species = linelist[10]
4745
name[linelist[1]] = [kindom,phylum,cla,order,family,genus,species]
4846
else:
49-
name[linelist[1]] = []
47+
name[linelist[1]] = []
48+
5049
out1 = gzip.open(outfile, "wt", compresslevel=1)
5150

5251
with gzip.open(infile2,"rt") as f2:
5352
for line in f2:
54-
line = line.strip()
55-
linelist = line.split("\t")
56-
rep = linelist[0]
57-
smorf = linelist[1]
53+
rep,smorf = line.strip().split("\t")
5854
if rep in cluster.keys():
5955
cluster[rep].append(smorf)
6056
lastrep = rep
@@ -63,10 +59,10 @@ def LCA(infile1,infile2,outfile):
6359
cluster[rep] = [smorf]
6460
for rank in range(7):
6561
flag = 1
66-
for i in range(len(cluster[lastrep])):
62+
for item in cluster[lastrep]:
6763
if taxa:
68-
if name[cluster[lastrep][i]]:
69-
if name[cluster[lastrep][i]][6-rank] in taxa:
64+
if name[item]:
65+
if name[item][6-rank] in taxa:
7066
continue
7167
else:
7268
flag = 0
@@ -75,8 +71,8 @@ def LCA(infile1,infile2,outfile):
7571
else:
7672
continue
7773
else:
78-
if name[cluster[lastrep][i]]:
79-
taxa.add(name[cluster[lastrep][i]][6-rank])
74+
if name[item]:
75+
taxa.add(name[item][6-rank])
8076
else:
8177
continue
8278
if flag == 1:
@@ -89,14 +85,14 @@ def LCA(infile1,infile2,outfile):
8985
elif rank == 6 and flag == 0:
9086
break
9187
else:
92-
for x in range(len(cluster[lastrep])):
93-
if name[cluster[lastrep][x]]:
94-
change[lastrep].append(name[cluster[lastrep][x]][j])
88+
for item in cluster[lastrep]:
89+
if name[item]:
90+
change[lastrep].append(name[item][j])
9591
break
9692
else:
9793
continue
98-
for n in range(len(cluster[lastrep])):
99-
out1.write(lastrep+"\t"+cluster[lastrep][n]+"\t")
94+
for item in cluster[lastrep]:
95+
out1.write(f'{lastrep}\t{item}\t')
10096
if len(change[lastrep]) != 0 :
10197
if len(change[lastrep]) != 1:
10298
for m in range(len(change[lastrep])-1):
@@ -112,22 +108,20 @@ def LCA(infile1,infile2,outfile):
112108
lastrep = rep
113109
for rank in range(7):
114110
flag = 1
115-
for i in range(len(cluster[lastrep])):
111+
for item in cluster[lastrep]:
116112
if taxa:
117-
if name[cluster[lastrep][i]]:
118-
if name[cluster[lastrep][i]][6-rank] in taxa:
113+
if name[item]:
114+
if name[item][6-rank] in taxa:
119115
continue
120116
else:
121117
flag = 0
122-
print(flag)
123-
print(taxa)
124118
taxa = set()
125119
break
126120
else:
127121
continue
128122
else:
129-
if name[cluster[lastrep][i]]:
130-
taxa.add(name[cluster[lastrep][i]][6-rank])
123+
if name[item]:
124+
taxa.add(name[item][6-rank])
131125
else:
132126
continue
133127
if flag == 1:
@@ -140,14 +134,14 @@ def LCA(infile1,infile2,outfile):
140134
elif rank == 6 and flag == 0:
141135
break
142136
else:
143-
for x in range(len(cluster[lastrep])):
144-
if name[cluster[lastrep][x]]:
145-
change[lastrep].append(name[cluster[lastrep][x]][j])
137+
for item in cluster[lastrep]:
138+
if name[item]:
139+
change[lastrep].append(name[item][j])
146140
break
147141
else:
148142
continue
149-
for n in range(len(cluster[lastrep])):
150-
out1.write(lastrep+"\t"+cluster[lastrep][n]+"\t")
143+
for item in cluster[lastrep]:
144+
out1.write(f'{lastrep}\t{item}\t')
151145
if len(change[lastrep]) != 0 :
152146
if len(change[lastrep]) != 1:
153147
for m in range(len(change[lastrep])-1):
@@ -166,8 +160,7 @@ def change(infile1,outfile):
166160
out = gzip.open(outfile, "wt", compresslevel=1)
167161
with gzip.open(infile1,"rt") as f1:
168162
for line in f1:
169-
line = line.strip()
170-
linelist = line.split("\t")
163+
linelist = line.strip().split("\t")
171164
out.write(linelist[1])
172165
if len(linelist) > 2:
173166
for i in range(2,len(linelist)):
@@ -198,12 +191,12 @@ def change(infile1,outfile):
198191
out.write("\n")
199192
out.close()
200193

201-
INPUT_FILE_1 = "./taxa/progenome/prog_redundant.tsv.gz"
202-
INPUT_FILE_2 = "./taxa/progenome/clust_result/prog_dedup_0.9_clu.tsv"
203-
INPUT_FILE_3 = "./taxa/progenome/prog_specI_genome_taxa.tsv.gz"
204-
OUTPUT_FILE_1 = "./taxa/progenome/prog_all_0.9_clu.tsv.gz"
205-
OUTPUT_FILE_2 = "./taxa/progenome/all_taxonomy.tsv.gz"
206-
OUTPUT_FILE_3 = "./taxa/progenome/prog_taxonomy_change.tsv.gz"
194+
INPUT_FILE_1 = "prog_redundant.tsv.gz"
195+
INPUT_FILE_2 = "prog_dedup_0.9_clu.tsv"
196+
INPUT_FILE_3 = "prog_specI_genome_taxa.tsv.gz"
197+
OUTPUT_FILE_1 = "prog_all_0.9_clu.tsv.gz"
198+
OUTPUT_FILE_2 = "all_taxonomy.tsv.gz"
199+
OUTPUT_FILE_3 = "prog_taxonomy_change.tsv.gz"
207200

208201
mergeall(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
209202
LCA(INPUT_FILE_3,OUTPUT_FILE_1,OUTPUT_FILE_2)

General_Scripts/01_Taxonomy_mapping/04_map_metag_taxid_full.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
'''
22
Concept:
3-
Map taxid for smORFs from metaG according to contig.
3+
Map taxid for smORFs from metaG based on contigs.
44
Get fullname of taxonomy of taxid according to GTDB files.
55
'''
66

@@ -15,13 +15,11 @@ def maptax(infile1,infile2,outfile):
1515
out = lzma.open(outfile, "wt")
1616

1717
f2 = lzma.open(infile2,"rt")
18-
header = f2.readline()
1918
metag = f2.readline().strip().split("\t")
2019

2120
with lzma.open(infile1,"rt") as f1:
2221
for line in f1:
23-
line = line.strip()
24-
linelist = line.split("\t")
22+
linelist = line.strip().split("\t")
2523
if line.startswith("sample"):
2624
continue
2725
else:
@@ -35,9 +33,9 @@ def maptax(infile1,infile2,outfile):
3533
while metag[1] == lastsample:
3634
contig2 = metag[1]+metag[2].split(" # ")[0].split("_")[0]+"_"+metag[2].split(" # ")[0].split("_")[1]
3735
if contig2 in taxa.keys():
38-
out.write(metag[0]+"\t"+taxa[contig2]+"\n")
36+
out.write(f'{metag[0]}\t{taxa[contig2]}\n')
3937
else:
40-
out.write(metag[0]+"\n")
38+
out.write(f'{metag[0]}\n')
4139
metag = f2.readline().strip().split("\t")
4240
if metag == [""]:
4341
break
@@ -53,9 +51,9 @@ def maptax(infile1,infile2,outfile):
5351
while metag[1] == lastsample:
5452
contig2 = metag[1]+metag[2].split(" # ")[0].split("_")[0]+"_"+metag[2].split(" # ")[0].split("_")[1]
5553
if contig2 in taxa.keys():
56-
out.write(metag[0]+"\t"+taxa[contig2]+"\n")
54+
out.write(f'{metag[0]}\t{taxa[contig2]}\n')
5755
else:
58-
out.write(metag[0]+"\n")
56+
out.write(f'{metag[0]}\n')
5957
metag = f2.readline().strip().split("\t")
6058
if metag == [""]:
6159
break
@@ -74,14 +72,12 @@ def fulltax(infile1,infile2,outfile):
7472

7573
with open(infile1,"rt") as f1:
7674
for line in f1:
77-
line = line.strip()
78-
linelist = line.split("\t")
75+
linelist = line.strip().split("\t")
7976
taxonomy.append(linelist[1])
8077

8178
with lzma.open(infile2,"rt") as f2:
8279
for line in f2:
83-
line = line.strip()
84-
linelist = line.split("\t")
80+
linelist = line.strip().split("\t")
8581
if line.startswith("sample"):
8682
continue
8783
else:
@@ -92,7 +88,7 @@ def fulltax(infile1,infile2,outfile):
9288
flag = 0
9389
if linelist[3] == "superkingdom":
9490
tax = "d__"+linelist[4]
95-
outf.write(linelist[2]+"\t"+tax+"\n")
91+
outf.write(f'{linelist[2]}\t{tax}\n')
9692
else:
9793
tax = linelist[3][0]+"__"+linelist[4]
9894
for i in range(len(taxonomy)):
@@ -117,15 +113,15 @@ def fulltax(infile1,infile2,outfile):
117113
flag = 1
118114
break
119115
if flag == 0:
120-
outf.write(linelist[2]+"\n")
116+
outf.write(f'{linelist[2]}\n')
121117

122118
outf.close()
123119

124120
INPUT_FILE_1 = "mmseqs2.lca_taxonomy.full.tsv.xz"
125121
INPUT_FILE_2 = "GMSC10.metag_smorfs.rename.txt.xz"
126-
INPUT_FILE_3 = "./taxa/metag/gtdb_taxonomy.tsv"
127-
OUTPUT_FILE_1 = "./taxa/metag/metag_taxid.tsv.xz"
128-
OUTPUT_FILE_2 = "./taxa/metag/taxid_fullname_gtdb.tsv"
122+
INPUT_FILE_3 = "gtdb_taxonomy.tsv"
123+
OUTPUT_FILE_1 = "metag_taxid.tsv.xz"
124+
OUTPUT_FILE_2 = "taxid_fullname_gtdb.tsv"
129125

130126
maptax(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
131127
fulltax(INPUT_FILE_3,INPUT_FILE_1,OUTPUT_FILE_2)

General_Scripts/01_Taxonomy_mapping/05_dedup_cluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
def splitseq(infile):
1414
print("start splitseq")
1515
outputlist = [
16-
f'/taxa/metag/dedup_cluster/split/sub_{ix:03}.faa.gz'
16+
f'sub_{ix:03}.faa.gz'
1717
for ix in range(256)]
1818
outputfiles = [
19-
gzip.open(f'/taxa/metag/dedup_cluster/split/sub_{ix:03}.faa.gz',compresslevel=1, mode='wt')
19+
gzip.open(f'sub_{ix:03}.faa.gz',compresslevel=1, mode='wt')
2020
for ix in range(256)]
2121
for ID,seq in fasta_iter(infile):
2222
h = hashlib.sha256()

General_Scripts/01_Taxonomy_mapping/06_map_taxonomy.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,15 @@ def metag_full(infile1,infile2,outpath):
2727

2828
with open(infile1,"rt") as f1:
2929
for line in f1:
30-
line = line.strip()
31-
linelist = line.split("\t",1)
30+
linelist = line.strip().split("\t",1)
3231
if len(linelist) == 2:
3332
tax[linelist[0]] = linelist[1]
3433
else:
3534
continue
35+
3636
with lzma.open(infile2,"rt") as f2:
3737
for line in f2:
38-
line = line.strip()
39-
linelist = line.split("\t")
38+
linelist = line.strip().split("\t")
4039
if n < 600000000:
4140
if len(linelist) == 2:
4241
if linelist[1] in tax.keys():
@@ -117,42 +116,38 @@ def metag_full(infile1,infile2,outpath):
117116
'''
118117
def map_cluster(infile1,infile2,infile3,outfile):
119118
tax = {}
120-
n = 0
121119
out = lzma.open(outfile, "wt")
122120
with gzip.open(infile1,"rt") as f1:
123121
for line in f1:
124-
line = line.strip()
125-
linelist = line.split("\t",1)
122+
linelist = line.strip().split("\t",1)
126123
if len(linelist) == 2:
127124
tax[linelist[0]] = linelist[1]
128125
else:
129126
continue
130127

131128
with lzma.open(infile2,"rt") as f2:
132129
for line in f2:
133-
line = line.strip()
134-
linelist = line.split("\t",1)
130+
linelist = line.strip().split("\t",1)
135131
if len(linelist) == 2:
136132
tax[linelist[0]] = linelist[1]
137133
else:
138134
continue
139135

140136
with gzip.open(infile3,"rt") as f4:
141137
for line in f4:
142-
line = line.strip()
143-
linelist = line.split("\t")
138+
linelist = line.strip().split("\t")
144139
if linelist[1] in tax.keys():
145140
out.write(linelist[0]+"\t"+linelist[1]+"\t"+tax[linelist[1]]+"\n")
146141
else:
147142
out.write(linelist[0]+"\t"+linelist[1]+"\n")
148143
out.close()
149144

150-
INPUT_FILE_1 = "./taxa/metag/taxid_fullname_gtdb.tsv"
151-
INPUT_FILE_2 = "./taxa/metag/metag_taxid.tsv.xz"
145+
INPUT_FILE_1 = "taxid_fullname_gtdb.tsv"
146+
INPUT_FILE_2 = "metag_taxid.tsv.xz"
152147
INPUT_FILE_3 = "dedup_cluster.tsv.gz"
153-
INPUT_FILE_4 = "./taxa/progenome/prog_taxonomy_change.tsv.gz"
154-
OUT_PATH_1 = "./taxa/metag/metag_taxonomy"
155-
OUT_PATH_2 = "./taxa/metag/metag_cluster_taxonomy"
148+
INPUT_FILE_4 = "prog_taxonomy_change.tsv.gz"
149+
OUT_PATH_1 = "metag_taxonomy"
150+
OUT_PATH_2 = "metag_cluster_taxonomy"
156151

157152
metag_full(INPUT_FILE_1,INPUT_FILE_2,OUT_PATH_1)
158153
for i in range(1,9):

0 commit comments

Comments
 (0)