Skip to content

Commit b25482a

Browse files
committed
ENH update 04_forzen
1 parent f0e3d4d commit b25482a

File tree

9 files changed

+148
-132
lines changed

9 files changed

+148
-132
lines changed

General_Scripts/00_Remove_redundancy_and_cluster/01_deduplicate_sort_merge_extract.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,9 @@ def extract_seq(infile1,infile2,outfile1,outfile2):
8787
fastaset = set()
8888
with gzip.open(infile1,"rt") as f:
8989
for line in f:
90-
line = line.strip()
91-
linelist = line.split("\t")
92-
if linelist[0] != "1":
93-
fastaset.add(linelist[1])
90+
count,seq = line.strip().split("\t")
91+
if count != "1":
92+
fastaset.add(seq)
9493

9594
with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
9695
gzip.open(outfile2, "wt", compresslevel=1) as out2:

General_Scripts/00_Remove_redundancy_and_cluster/07_identify_clusters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def identify(infile,outfile):
1313
continue
1414
else:
1515
nameset.add(linelist[0])
16-
out.write(linelist[0]+"\t"+linelist[2]+"\n")
16+
out.write(f'{linelist[2]}\t{linelist[0]}\n')
1717

1818
for i in range(24):
1919
INPUT_FILE_1 = "sub"+str(i)+".faa.gz.tsv"

General_Scripts/04_Frozen/01_rename_list.py

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,63 +2,65 @@
22
Concept:
33
Generate original name - 100AA rename list.
44
Peptides are named: >GMSC10.100AA.XXX_XXX_XXX
5-
Numbers were assigned in order of increasing number of copies.
6-
So that the lower the number, the lower the number of copies of that peptide was present in the input data.
7-
And if the number of copies is same, numbers were assigned in order of letters of peptides.
5+
Numbers were assigned in order of increasing number of copies. If the number of copies is same, numbers were assigned in order of letters of peptides.
86
'''
97

108
def sort(infile,outfile,n,prefix):
119
from operator import itemgetter
1210
import gzip
13-
seqnumber_list=[]
11+
12+
seqnumber_list = []
13+
1414
with gzip.open(infile,"rt") as f1:
1515
for line in f1 :
16-
line = line.strip()
17-
linelist = line.split("\t")
18-
if linelist[0] != "1":
19-
seqnumber_tup = (int(linelist[0]),linelist[1])
16+
count,seq = line.strip().split("\t")
17+
if count != "1":
18+
seqnumber_tup = (int(count),seq)
2019
seqnumber_list.append(seqnumber_tup)
21-
sortseqnumber_list=sorted(seqnumber_list,key=itemgetter(0,1))
20+
21+
sortseqnumber_list = sorted(seqnumber_list,key=itemgetter(0,1))
22+
2223
with open(outfile,"wt") as out:
23-
for i in range (len(seqnumber_list)):
24+
for item in sortseqnumber_list:
2425
nf = f'{n:09}'
25-
out.write(f'{sortseqnumber_list[i][0]}\t{sortseqnumber_list[i][1]}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n')
26+
out.write(f'{item[0]}\t{item[1]}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n')
2627
n += 1
2728

2829
def rename_nonsingleton(infile1,infile2,outfile):
2930
from fasta import fasta_iter
31+
3032
fastadict={}
33+
3134
for ID,seq in fasta_iter(infile1):
3235
fastadict[seq] = ID
33-
out = open(outfile, "w")
34-
with open (infile2) as f1:
35-
for line in f1 :
36-
line = line.strip()
37-
linelist = line.split("\t")
38-
name = fastadict[linelist[1]]
39-
newname = linelist[2]
40-
out.write(f'{name}\t{newname}\n')
41-
out.close()
36+
37+
with open(outfile,'wt') as out:
38+
with open (infile2) as f1:
39+
for line in f1 :
40+
count,seq,newname = line.strip().split("\t")
41+
name = fastadict[seq]
42+
out.write(f'{name}\t{newname}\n')
4243

4344
def rename_singleton(infile1,infile2,outfile,n,prefix):
4445
from fasta import fasta_iter
46+
4547
name=set()
46-
with open (infile1) as f1:
48+
49+
with open(infile1) as f1:
4750
for line in f1 :
48-
line = line.strip()
49-
linelist = line.split("\t")
50-
name.add(linelist[0])
51-
out = open(outfile, "w")
52-
for ID,seq in fasta_iter(infile2):
53-
if ID in name:
54-
nf = f'{n:09}'
55-
out.write(f'{ID}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n')
56-
n += 1
57-
out.close()
51+
singleton,cluster = line.strip().split("\t")
52+
name.add(singleton)
53+
54+
with open(outfile,'wt') as out:
55+
for ID,seq in fasta_iter(infile2):
56+
if ID in name:
57+
nf = f'{n:09}'
58+
out.write(f'{ID}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n')
59+
n += 1
5860

5961
INPUT_FILE_1 = "metag_ProG.raw_number.tsv.gz"
6062
INPUT_FILE_2 = "metag_ProG_nonsingleton.faa.gz"
61-
INPUT_FILE_3 = "singleton_0.5_0.9.tsv"
63+
INPUT_FILE_3 = "singleton_0.9.tsv"
6264
INPUT_FILE_4 = "metag_ProG_singleton.faa.gz"
6365
OUTPUT_FILE_1 = "nonsingleton_rename_seq.tsv"
6466
OUTPUT_FILE_2 = "nonsingleton_rename.tsv"

General_Scripts/04_Frozen/02_100AA_faa_fna.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,27 @@
66
def getseq(infile1,infile2,outfile):
77
from fasta import fasta_iter
88
import lzma
9+
910
name = {}
10-
out1 = lzma.open(outfile, "wt")
11+
out = lzma.open(outfile, "wt")
1112

12-
with lzma.open(infile1,"rt") as f1:
13+
with open(infile1,"rt") as f1:
1314
for line in f1:
14-
line = line.strip()
15-
linelist = line.split("\t")
16-
name[linelist[0]] = linelist[1]
15+
old,new = line.strip().split("\t")
16+
name[old] = new
1717

1818
for ID,seq in fasta_iter(infile2):
1919
if ID in name.keys():
20-
out1.write(f'>{name[ID]}\n{seq}\n')
21-
out1.close()
20+
out.write(f'>{name[ID]}\n{seq}\n')
21+
out.close()
2222

23-
INPUT_FILE_1 = "./data/100AA_rename.tsv.xz"
24-
INPUT_FILE_2 = "./data/metag_ProG_dedup.faa.gz"
23+
INPUT_FILE_1 = "100AA_rename.tsv"
24+
INPUT_FILE_2 = "metag_ProG_dedup.faa.gz"
2525
INPUT_FILE_3 = "GMSC10.metag_smorfs.fna.xz"
2626
INPUT_FILE_4 = "GMSC.ProGenomes2.smorfs.fna.xz"
27-
OUTPUT_FILE_1 = "./data/frozen/100AA_GMSC.faa.xz"
28-
OUTPUT_FILE_2 = "./data/frozen/100AA_metag.fna.xz"
29-
OUTPUT_FILE_3 = "./data/frozen/100AA_prog.fna.xz"
27+
OUTPUT_FILE_1 = "100AA_GMSC.faa.xz"
28+
OUTPUT_FILE_2 = "100AA_metag.fna.xz"
29+
OUTPUT_FILE_3 = "100AA_prog.fna.xz"
3030

3131
getseq(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
3232
getseq(INPUT_FILE_1,INPUT_FILE_3,OUTPUT_FILE_2)

General_Scripts/04_Frozen/03_90AA_faa_fna.py

Lines changed: 39 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -11,105 +11,98 @@
1111
'''
1212
Generate original name - 90AA rename list.
1313
Peptides are named: >GMSC10.90AA.XXX_XXX_XXX
14-
Numbers were assigned in order of increasing number of copies.
15-
So that the lower the number, the lower the number of copies of that peptide was present in the input data.
16-
And if the number of copies is same, numbers were assigned in order of letters of peptides.
1714
'''
1815
def rename(infile1,infile2,outfile,n,prefix):
1916
number = {}
20-
seqnumber_list=[]
17+
seqnumber_list = []
2118

2219
with gzip.open(infile2,"rt") as f2:
2320
for line in f2 :
24-
line = line.strip()
25-
linelist = line.split("\t")
26-
if linelist[0] in number.keys():
27-
number[linelist[0]] += 1
21+
cluster,member = line.strip().split("\t")
22+
if cluster in number.keys():
23+
number[cluster] += 1
2824
else:
29-
number[linelist[0]] = 1
25+
number[cluster] = 1
3026

3127
for ID,seq in fasta_iter(infile1):
3228
seqnumber_tup = (int(number[ID]),seq,ID)
3329
seqnumber_list.append(seqnumber_tup)
3430

35-
sortseqnumber_list=sorted(seqnumber_list,key=itemgetter(0,1))
31+
sortseqnumber_list = sorted(seqnumber_list,key=itemgetter(0,1))
3632
with lzma.open(outfile,"wt") as out:
37-
for i in range (len(sortseqnumber_list)):
33+
for item in sortseqnumber_list:
3834
nf = f'{n:09}'
39-
out.write(f'{sortseqnumber_list[i][2]}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n')
40-
n += 1
35+
out.write(f'{item[2]}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n')
36+
n += 1
4137

4238
'''
43-
Generate originalname - 100AA - 90AA rename list.
39+
Generate original name - 100AA - 90AA rename list.
4440
'''
4541
def rename_all(infile1,infile2,outfile):
4642
name = {}
4743
out1 = lzma.open(outfile, "wt")
4844

4945
with lzma.open(infile1,"rt") as f1:
5046
for line in f1:
51-
line = line.strip()
52-
linelist = line.split("\t")
53-
name[linelist[0]] = linelist[1]
47+
old,new = line.strip().split("\t")
48+
name[old] = new
5449

5550
with gzip.open(infile2,"rt") as f2:
5651
for line in f2:
57-
line = line.strip().strip(">")
58-
linelist = line.split("\t")
59-
if linelist[0] in name.keys():
60-
out1.write(linelist[0]+"\t"+linelist[1]+"\t"+name[linelist[0]]+"\n")
61-
out1.close()
52+
old,new = line.strip().split("\t")
53+
if old in name.keys():
54+
out1.write(f'{old}\t{new}\t{name[new]}\n')
55+
out1.close()
6256

6357
'''
6458
Generate rename and sequence of 90AA faa.
6559
'''
6660
def getfaa(infile1,infile2,outfile):
6761
name = {}
68-
out1 = lzma.open(outfile, "wt")
62+
out = lzma.open(outfile, "wt")
6963

7064
with lzma.open(infile1,"rt") as f1:
7165
for line in f1:
72-
line = line.strip()
73-
linelist = line.split("\t")
74-
name[linelist[0]] = linelist[1]
75-
76-
66+
old,new = line.strip().split("\t")
67+
name[old] = new
68+
7769
for ID,seq in fasta_iter(infile2):
78-
out1.write(f'>{name[ID]}\n{seq}\n')
79-
out1.close()
70+
out.write(f'>{name[ID]}\n{seq}\n')
71+
out.close()
8072

8173
'''
8274
Generate rename and sequence of 90AA fna.
8375
'''
8476
def getfna(infile1,infile2,outfile):
8577
fasta = {}
8678
table = {}
87-
out1 = lzma.open(outfile, "wt")
79+
80+
out = lzma.open(outfile, "wt")
81+
8882
with lzma.open(infile1,"rt") as f1:
8983
for line in f1:
90-
line = line.strip()
91-
linelist = line.split("\t")
92-
table[linelist[1]] = linelist[2]
84+
old,name100,name90 = line.strip().split("\t")
85+
table[name100] = name90
9386

9487
for ID,seq in fasta_iter(infile2):
9588
if ID in table.keys():
9689
fasta[table[ID]] = seq
97-
table = {}
90+
9891
for ID,seq in sorted(fasta.items()):
99-
out1.write(f">{ID}\n{seq}\n")
100-
out1.close()
92+
out.write(f">{ID}\n{seq}\n")
93+
out.close()
10194

102-
INPUT_FILE_1 = "./clust_result/0.5_result/metag_ProG_nonsingleton_0.9_clu_rep.faa.gz"
103-
INPUT_FILE_2 = "./clust_result/0.5_result/metag_ProG_nonsingleton_0.9_clu.tsv.gz"
104-
INPUT_FILE_3 = "./data/100AA_rename.tsv.xz"
105-
INPUT_FILE_4 = "./data/frozen/100AA_GMSC.fna.xz"
95+
INPUT_FILE_1 = "metag_ProG_nonsingleton_0.9_clu_rep.faa.gz"
96+
INPUT_FILE_2 = "metag_ProG_nonsingleton_0.9_clu.tsv.gz"
97+
INPUT_FILE_3 = "100AA_rename.tsv.xz"
98+
INPUT_FILE_4 = "100AA_GMSC.fna.xz"
10699

107-
OUTPUT_FILE_1 = "./data/frozen/90AA_rename.tsv.xz"
108-
OUTPUT_FILE_2 = "./data/frozen/90AA_rename_all.tsv.xz"
109-
OUTPUT_FILE_3 = "./data/frozen/90AA_GMSC.faa.xz"
110-
OUTPUT_FILE_4 = "./data/frozen/90AA_GMSC.fna.xz"
100+
OUTPUT_FILE_1 = "90AA_rename.tsv.xz"
101+
OUTPUT_FILE_2 = "90AA_rename_all.tsv.xz"
102+
OUTPUT_FILE_3 = "90AA_GMSC.faa.xz"
103+
OUTPUT_FILE_4 = "90AA_GMSC.fna.xz"
111104

112105
rename(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1,0,'GMSC10.90AA')
113106
rename_all(OUTPUT_FILE_1,INPUT_FILE_3,OUTPUT_FILE_2)
114107
getfaa(OUTPUT_FILE_1,INPUT_FILE_1,OUTPUT_FILE_3)
115-
getfna(OUTPUT_FILE_2,INPUT_FILE_4,OUTPUT_FILE_4 )
108+
getfna(OUTPUT_FILE_2,INPUT_FILE_4,OUTPUT_FILE_4)
Lines changed: 23 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,36 @@
11
'''
22
Concept:
3-
Generate the table including the name of smORFs, and the clusters they belong to at 90% identity.
3+
Generate the table including the 100AA smORFs and 90AA families.
44
'''
55

66
def generate_family(infile1,infile2,infile3,outfile):
77
import lzma
88
import gzip
9-
name50 = {}
9+
1010
name90 = {}
11-
out1 = lzma.open(outfile, "wt")
12-
13-
with lzma.open(infile1,"rt") as f1:
14-
for line in f1:
15-
line = line.strip()
16-
linelist = line.split("\t")
17-
name50[linelist[1]] = linelist[2]
18-
19-
with lzma.open(infile2,"rt") as f2:
20-
for line in f2:
21-
line = line.strip()
22-
linelist = line.split("\t")
23-
name90[linelist[1]] = linelist[2]
11+
out = open(outfile, "wt")
2412

25-
with gzip.open(infile3,"rt") as f3:
26-
for line in f3:
27-
line = line.strip()
28-
linelist = line.split("\t")
29-
if len(linelist) == 2:
30-
out1.write(linelist[0]+"\t"+""+"\t"+name50[linelist[1]]+"\n")
31-
elif len(linelist) == 3 and linelist[1] != "":
32-
out1.write(linelist[0]+"\t"+name90[linelist[2]]+"\t"+name50[linelist[1]]+"\n")
33-
else:
34-
out1.write(linelist[0]+"\t"+name90[linelist[2]]+"\n")
35-
out1.close()
13+
with lzma.open(infile1,"rt") as f:
14+
for line in f:
15+
old,new100,new90 = line.strip().split("\t")
16+
name90[new100] = new90
17+
18+
name100 = {}
19+
with open(infile2,'rt') as f:
20+
for line in f:
21+
old,new = line.strip().split("\t")
22+
name100[old] = new
23+
24+
with gzip.open(infile3,"rt") as f:
25+
for line in f:
26+
cluster,member = line.strip().split("\t")
27+
out.write(f'{name100[member]}\t{name90[cluster]}\n')
28+
out.close()
3629

37-
INPUT_FILE_1 = "./data/frozen/50AA_rename_all.tsv.xz"
38-
INPUT_FILE_2 = "./data/frozen/90AA_rename_all.tsv.xz"
39-
INPUT_FILE_3 = "./clust_result/result/all_0.5_0.9_rename.tsv.gz"
40-
OUTPUT_FILE = "./data/frozen/all_0.9_0.5_family.tsv.xz"
30+
INPUT_FILE_1 = "90AA_rename_all.tsv.xz"
31+
INPUT_FILE_2 = "100AA_rename.tsv"
32+
INPUT_FILE_3 = "all_0.9.tsv.gz"
33+
OUTPUT_FILE = "GMSC.cluster.tsv"
4134

4235
generate_family(INPUT_FILE_1,INPUT_FILE_2,INPUT_FILE_3,OUTPUT_FILE)
4336

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
# 04_Frozen
1+
## 04_Frozen
2+
23
| **Code** | **Description** | **Input** | **Output** |
34
| :---: | :---: | :---: | :---: |
4-
| 01_rename_list.py | Rename 100AA sequences | metag_ProG.raw_number.tsv.gz metag_ProG_nonsingleton.faa.gz singleton_0.5_0.9.tsv metag_ProG_singleton.faa.gz| nonsingleton_rename.tsv singleton_rename.tsv |
5-
| 02_100AA_faa_fna.py | Generate 100AA faa and fna file with new identifier | 100AA_rename.tsv.xz metag_ProG_dedup.faa.gz GMSC10.metag_smorfs.fna.xz GMSC.ProGenomes2.smorfs.fna.xz | 100AA_GMSC.faa.xz 100AA_metag.fna.xz 100AA_prog.fna.xz |
5+
| 01_rename_list.py | Rename 100AA sequences | metag_ProG.raw_number.tsv.gz metag_ProG_nonsingleton.faa.gz singleton_0.9.tsv metag_ProG_singleton.faa.gz| nonsingleton_rename.tsv singleton_rename.tsv |
6+
| 02_100AA_faa_fna.py | Generate 100AA faa and fna file with new identifier | 100AA_rename.tsv metag_ProG_dedup.faa.gz GMSC10.metag_smorfs.fna.xz GMSC.ProGenomes2.smorfs.fna.xz | 100AA_GMSC.faa.xz 100AA_metag.fna.xz 100AA_prog.fna.xz |
67
| 03_90AA_faa_fna.py | Rename 90AA sequences and generate 90AA faa and fna file with new identifier | metag_ProG_nonsingleton_0.9_clu_rep.faa.gz metag_ProG_nonsingleton_0.9_clu.tsv.gz 100AA_rename.tsv.xz 100AA_GMSC.fna.xz | 90AA_rename.tsv.xz 90AA_rename_all.tsv.xz 90AA_GMSC.faa.xz 90AA_GMSC.fna.xz |
7-
| 04_family.py | Generate the cluster table | 90AA_rename_all.tsv.xz all_0.5_0.9_rename.tsv.gz | all_0.9_0.5_family.tsv.xz |
8+
| 04_family.py | Generate the family table | 90AA_rename_all.tsv.xz 100AA_rename.tsv all_0.9.tsv.gz | GMSC.cluster.tsv |

0 commit comments

Comments
 (0)