Skip to content

Commit ee78a1f

Browse files
committed
ENH update 03_Quality_control
1 parent b25482a commit ee78a1f

File tree

10 files changed

+329
-41
lines changed

10 files changed

+329
-41
lines changed

General_Scripts/03_Quality_control/RNAcode/04_filter_RNAcode.py

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,14 @@ def filter(file_dir,outfile):
77
import os
88
out = open(outfile, "wt")
99
for n in range(1,288):
10-
print("first"+str(n)+"\n")
11-
first_dir = file_dir+"/first"+str(n)
10+
first_dir = f'{file_dir}/first{n}'
1211
for m in range(1,301):
13-
print("second"+str(m)+"\n")
14-
second_dir = first_dir+"/second"+str(m)
12+
second_dir = f'{first_dir}/second{m}'
1513
if os.listdir(second_dir):
1614
for infile in os.listdir(second_dir):
17-
file_path = second_dir+"/"+infile
18-
with open (file_path) as f1:
19-
for line in f1 :
15+
file_path = f'{second_dir}/{infile}'
16+
with open(file_path) as f:
17+
for line in f:
2018
linelist = line.strip().split("\t")
2119
if float(linelist[-1]) < 0.05:
2220
filesplit = infile.split(".")
@@ -50,6 +48,56 @@ def true_false_100AA_90AA(infile1,infile2,outfile1,outfile2,outfile3):
5048
out2.close()
5149
out3.close()
5250

51+
def file_name(file_dir,outfile):
52+
import os
53+
54+
out = open(outfile, "w")
55+
for n in range(1,288):
56+
first_dir = f'{file_dir}/first{n}'
57+
for m in range(1,301):
58+
second_dir = f'{first_dir}/second{m}'
59+
for infile in os.listdir(second_dir):
60+
file_path = f'{second_dir}/{infile}'
61+
name = infile.replace('.fna.aln.tsv','')
62+
with open(file_path) as f:
63+
linelist = f.readline().strip().split('\t')
64+
if len(linelist) >1:
65+
out.write(f'{name}\t{linelist[10]}\n')
66+
out.close()
67+
68+
def full_90(infile,outfile):
69+
metaT = {}
70+
with open(infile,'rt') as f:
71+
for line in f:
72+
cluster,number = line.strip().split('\t')
73+
metaT[cluster] = number
74+
75+
with open(outfile,'wt') as out:
76+
for i in range(287926875):
77+
nf = f'{i:09}'
78+
name = f'GMSC10.90AA.{nf[:3]}_{nf[3:6]}_{nf[6:9]}'
79+
if name in metaT.keys():
80+
out.write(f'{name}\t{metaT[name]}\n')
81+
else:
82+
out.write(f'{name}\tNA\n')
83+
84+
def full_100(infile1,infile2,outfile):
85+
import gzip
86+
87+
rnacode = {}
88+
with open(infile1,'rt') as f:
89+
for line in f:
90+
cluster,number = line.strip().split('\t')
91+
rnacode[cluster] = number
92+
93+
with open(outfile,'wt') as out:
94+
with gzip.open(infile2,'rt') as f:
95+
for line in f:
96+
member,cluster = line.strip().split('\t')
97+
if cluster in rnacode.keys():
98+
out.write(f'{member}\t{rnacode[cluster]}\n')
99+
else:
100+
out.write(f'{member}\tNA\n')
53101

54102
INPUT_DIR = "./rnacode"
55103
INPUT_FILE_1 = "GMSC.cluster_filter.tsv"
@@ -59,4 +107,14 @@ def true_false_100AA_90AA(infile1,infile2,outfile1,outfile2,outfile3):
59107
OUTPUT_FILE_4 = "rnacode_false_90AA.tsv"
60108

61109
filter(INPUT_DIR,OUTPUT_FILE_1)
62-
true_false_100AA_90AA(OUTPUT_FILE_1,INPUT_FILE_1,OUTPUT_FILE_2,OUTPUT_FILE_3,OUTPUT_FILE_4)
110+
true_false_100AA_90AA(OUTPUT_FILE_1,INPUT_FILE_1,OUTPUT_FILE_2,OUTPUT_FILE_3,OUTPUT_FILE_4)
111+
112+
OUTPUT_FILE_5 = "90AA_RNAcode_p.tsv"
113+
file_name(INPUT_DIR,OUTPUT_FILE_5)
114+
115+
OUTPUT_FILE_6 = '90AA_RNAcode.tsv'
116+
full_90(OUTPUT_FILE_5,OUTPUT_FILE_6)
117+
118+
INPUT_FILE_3 = 'GMSC.cluster.tsv.gz'
119+
OUTPUT_FILE_7 = '100AA_RNAcode.tsv'
120+
full_100(INPUT_FILE_2,INPUT_FILE_3,OUTPUT_FILE_7)

General_Scripts/03_Quality_control/Readme.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,35 @@
1919
| 01_filter8_addfna_split.py | Select clusters(>= 8 members) | GMSC.cluster.tsv.gz metag_ProG_smorfs.fna.xz | ./split/*.fna |
2020
| 02_run_MSA.sh | Multiple sequences alignment of each .fna file | ./split/*.fna | *.aln |
2121
| 03_run_RNAcode.sh | Run RNAcode | *.aln | *.tsv |
22-
| 04_filter_RNAcode.py | Filter RNAcode result | *.tsv GMSC.cluster_filter.tsv| rnacode_true_90AA.tsv rnacode_true_100AA.tsv rnacode_false_100AA.tsv rnacode_false_90AA.tsv |
22+
| 04_filter_RNAcode.py | Filter RNAcode result | *.tsv GMSC.cluster_filter.tsv GMSC.cluster.tsv.gz | rnacode_true_90AA.tsv rnacode_true_100AA.tsv rnacode_false_100AA.tsv rnacode_false_90AA.tsv 90AA_RNAcode.tsv 100AA_RNAcode.tsv |
2323

2424
### metatranscriptomics
2525

2626
| **Code** | **Description** | **Input** | **Output** |
2727
| :---: | :---: | :---: | :---: |
2828
| 01_run_bwa_ngless.sh | Map metatranscriptome reads to smORFs | 90AA_GMSC.fna *.fastq.gz | *.tsv |
29-
| 02_merge_filter.py | Merge and filter mapping results | *.tsv GMSC.cluster.tsv.gz | metaT_result.tsv metaT_90AA.tsv metaT_100AA.tsv |
29+
| 02_merge_filter.py | Merge and filter mapping results | *.tsv GMSC.cluster.tsv.gz | metaT_result.tsv metaT_90AA.tsv metaT_100AA.tsv 90AA_metaT.tsv 100AA_metaT.tsv|
3030

3131
### riboseq
3232

3333
| **Code** | **Description** | **Input** | **Output** |
3434
| :---: | :---: | :---: | :---: |
3535
| 01_run_bwa_ngless.sh | Map riboseq reads to smORFs | 90AA_GMSC.fna *.fastq.gz | *.tsv |
36-
| 02_merge_filter.py | Merge and filter mapping results | *.tsv GMSC.cluster.tsv.gz | riboseq_result.tsv riboseq_90AA.tsv riboseq_100AA.tsv |
36+
| 02_merge_filter.py | Merge and filter mapping results | *.tsv GMSC.cluster.tsv.gz | riboseq_result.tsv riboseq_90AA.tsv riboseq_100AA.tsv 90AA_RiboSeq.tsv 100AA_RiboSeq.tsv |
3737

3838
### metaproteomics
3939

4040
| **Code** | **Description** | **Input** | **Output** |
4141
| :---: | :---: | :---: | :---: |
4242
| 00_split_100AA.py 01_map.py | For each metaproteomes peptides from each project in PRIDE,find their exact match against 100AA smORFs | 100AA_GMSC.faa.xz *.fasta | *.tsv |
43-
| 02_merge.py | Calculate and filter peptide coverage rate of each smORF | *.tsv | coverage_analysis.tsv |
44-
| 03_assign_all_level.py | Assign results to 90AA smORFs | coverage_analysis.tsv GMSC.cluster.tsv.gz | metaP_90AA.tsv.gz |
43+
| 02_merge.py | Calculate and filter peptide coverage rate of each smORF | *.tsv | coverage_analysis.tsv 100AA_metaP.tsv |
44+
| 03_assign_all_level.py | Assign results to 90AA smORFs | coverage_analysis.tsv GMSC.cluster.tsv.gz | metaP_90AA.tsv.gz 100AA_metaP_all.tsv 90AA_metaP.tsv |
4545

4646

4747
### merge_quality_control
4848

4949
| **Code** | **Description** | **Input** | **Output** |
5050
| :---: | :---: | :---: | :---: |
51-
| 01_merge.py | Merge all the quality control results | 100AA_rename.tsv.xz rnacode_true_100AA.tsv.xz rnacode_false_100AA.tsv.xz antifam_result.tsv coverage_analysis.tsv.gz riboseq_100AA.tsv.gz 100AA_coordinate.tsv.gz metaT_100AA.tsv.gz | allquality_100AA.tsv.gz allpass_100AA.txt |
52-
| 02_statistic.py | Merge all the quality control results | 100AA_rename.tsv.xz rnacode_true_100AA.tsv.xz rnacode_false_100AA.tsv.xz antifam_result.tsv coverage_analysis.tsv.gz riboseq_100AA.tsv.gz 100AA_coordinate.tsv.gz metaT_100AA.tsv.gz | allquality_100AA.tsv.gz allpass_100AA.txt |
51+
| 01_merge.py | Merge all the quality control results | GMSC.cluster.tsv.gz rnacode_true_100AA.tsv.xz rnacode_false_100AA.tsv.xz antifam_result.tsv coverage_analysis.tsv.gz riboseq_100AA.tsv.gz 100AA_coordinate.tsv.gz metaT_100AA.tsv.gz rnacode_true_90AA.tsv.xz rnacode_false_90AA.tsv.xz antifam_90AA.tsv metaP_90AA.tsv.gz riboseq_90AA.tsv.gz 90AA_coordinate.tsv.gz metaT_90AA.tsv.gz | GMSC10.100AA.quality.tsv.xz GMSC10.90AA.quality.tsv.xz allpass_100AA.txt allpass_90AA.txt |
52+
| 02_statistic.py | Merge all the quality control results | GMSC.cluster.tsv.gz rnacode_true_100AA.tsv.xz rnacode_false_100AA.tsv.xz antifam_result.tsv coverage_analysis.tsv.gz riboseq_100AA.tsv.gz 100AA_coordinate.tsv.gz metaT_100AA.tsv.gz | allquality_100AA.tsv.gz allpass_100AA.txt |
53+
| 03_merge_all.py | Merge all the values of quality control results | GMSC10.100AA.quality.tsv.xz 100AA_RNAcode.tsv 100AA_metaT.tsv 100AA_RiboSeq.tsv 100AA_metaP_all.tsv GMSC10.90AA.quality.tsv.xz 90AA_RNAcode.tsv 90AA_metaT.tsv 90AA_RiboSeq.tsv 90AA_metaP.tsv | GMSC10.100AA.quality.tsv.xz GMSC10.90AA.quality.tsv.xz allpass_100AA.txt allpass_90AA.txt GMSC10.100AA.quality_test.tsv GMSC10.90AA.quality_test.tsv |

General_Scripts/03_Quality_control/merge_quality_control/01_merge.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,29 @@
33
If it pass all the computational checking(Antifam,RNAcode,coordinate),
44
and has at least 1 experimental evidence(Metaproteomes,metatranstomes,riboseq),then it will be high quality.
55
'''
6+
def store_100(infile):
7+
import lzma
8+
smorf_100 = {}
9+
with lzma.open(infile,"rt") as f1:
10+
for line in f1:
11+
member,cluster = line.strip().split("\t")
12+
smorf_100[member] = ["NA","T","F","F","NA","F"]
13+
return smorf_100
14+
15+
def store_90(infile):
16+
import lzma
17+
smorf_90 = {}
18+
with lzma.open(infile,"rt") as f1:
19+
for line in f1:
20+
member,cluster = line.strip().split("\t")
21+
smorf_90[cluster] = ["NA","T","F","F","NA","F"]
22+
return smorf_90
623

7-
def merge(infile1,infile2,infile3,infile4,infile5,infile6,infile7,infile8,outfile):
24+
def merge(smorf,infile2,infile3,infile4,infile5,infile6,infile7,infile8,outfile):
825
import lzma
926
import gzip
1027

1128
out = lzma.open(outfile, "wt")
12-
smorf = {}
13-
14-
with lzma.open (infile1,"rt") as f1:
15-
for line in f1:
16-
linelist = line.strip().split("\t")
17-
smorf[linelist[0]] = ["NA","T","F","F","NA","F"]
1829

1930
with lzma.open(infile2,"rt") as f2:
2031
for line in f2:
@@ -78,19 +89,22 @@ def allpass(infile,outfile):
7889
OUTPUT_FILE_1 = "GMSC10.100AA.quality.tsv.xz"
7990
OUTPUT_FILE_2 = "allpass_100AA.txt"
8091

81-
merge(INPUT_FILE_1,INPUT_FILE_2,INPUT_FILE_3,INPUT_FILE_4,INPUT_FILE_5,INPUT_FILE_6,INPUT_FILE_7,INPUT_FILE_8,OUTPUT_FILE_1)
92+
smorf_100 = store_100(INPUT_FILE_1)
93+
merge(smorf_100,INPUT_FILE_2,INPUT_FILE_3,INPUT_FILE_4,INPUT_FILE_5,INPUT_FILE_6,INPUT_FILE_7,INPUT_FILE_8,OUTPUT_FILE_1)
8294
allpass(OUTPUT_FILE_1,OUTPUT_FILE_2)
8395

96+
#90AA
8497
INPUT_FILE_1 = "GMSC.cluster.tsv.gz"
8598
INPUT_FILE_2 = "rnacode_true_90AA.tsv"
8699
INPUT_FILE_3 = "rnacode_false_90AA.tsv"
87-
INPUT_FILE_4 = "antifam_90AA.tsv.gz"
100+
INPUT_FILE_4 = "antifam_90AA.tsv"
88101
INPUT_FILE_5 = "metaP_90AA.tsv.gz"
89102
INPUT_FILE_6 = "riboseq_90AA.tsv"
90103
INPUT_FILE_7 = "90AA_coordinate.tsv.gz"
91104
INPUT_FILE_8 = "metaT_90AA.tsv"
92105
OUTPUT_FILE_1 = "GMSC10.90AA.quality.tsv.xz"
93106
OUTPUT_FILE_2 = "allpass_90AA.txt"
94107

95-
merge(INPUT_FILE_1,INPUT_FILE_2,INPUT_FILE_3,INPUT_FILE_4,INPUT_FILE_5,INPUT_FILE_6,INPUT_FILE_7,INPUT_FILE_8,OUTPUT_FILE_1)
108+
smorf_90 = store_90(INPUT_FILE_1)
109+
merge(smorf_90,INPUT_FILE_2,INPUT_FILE_3,INPUT_FILE_4,INPUT_FILE_5,INPUT_FILE_6,INPUT_FILE_7,INPUT_FILE_8,OUTPUT_FILE_1)
96110
allpass(OUTPUT_FILE_1,OUTPUT_FILE_2)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
def merge(number,n,infile1,infile2,infile3,infile4,infile5,outfile):
2+
import lzma
3+
4+
antifam = {}
5+
terminal = {}
6+
rnacode = {}
7+
metat = {}
8+
riboseq = {}
9+
metap = {}
10+
11+
with lzma.open(infile1,'rt') as f:
12+
for line in f:
13+
if line.startswith('#'):
14+
continue
15+
else:
16+
linelist = line.strip().split('\t')
17+
antifam[linelist[0]] = linelist[2]
18+
terminal[linelist[0]] = linelist[5]
19+
20+
with open(infile2,'rt') as f:
21+
for line in f:
22+
cluster,number = line.strip().split('\t')
23+
rnacode[cluster] = number
24+
25+
with open(infile3,'rt') as f:
26+
for line in f:
27+
cluster,number = line.strip().split('\t')
28+
metat[cluster] = number
29+
30+
with open(infile4,'rt') as f:
31+
for line in f:
32+
cluster,number = line.strip().split('\t')
33+
riboseq[cluster] = number
34+
35+
with open(infile5,'rt') as f:
36+
for line in f:
37+
cluster,number = line.strip().split('\t')
38+
metap[cluster] = number
39+
40+
with open(outfile,'wt') as out:
41+
out.write(f'AntiFam\tTerminal checking\tRNAcode\tmetaTranscriptome\tRiboseq\tmetaProteome\n')
42+
for i in range(number):
43+
nf = f'{i:09}'
44+
name = f'GMSC10.{n}AA.{nf[:3]}_{nf[3:6]}_{nf[6:9]}'
45+
out.write(f'{antifam[name]}\t{terminal[name]}\t{rnacode[name]}\t{metat[name]}\t{riboseq[name]}\t{metap[name]}\n')
46+
47+
NUMBER_100 = 964970496
48+
NUMBER_90 = 287926875
49+
50+
infile1 = 'GMSC10.100AA.quality.tsv.xz'
51+
infile2 = '100AA_RNAcode.tsv'
52+
infile3 = '100AA_metaT.tsv'
53+
infile4 = '100AA_RiboSeq.tsv'
54+
infile5 = '100AA_metaP_all.tsv'
55+
outfile = 'GMSC10.100AA.quality_test.tsv'
56+
merge(NUMBER_100,100,infile1,infile2,infile3,infile4,infile5,outfile)
57+
58+
infile1 = 'GMSC10.90AA.quality.tsv.xz'
59+
infile2 = '90AA_RNAcode.tsv'
60+
infile3 = '90AA_metaT.tsv'
61+
infile4 = '90AA_RiboSeq.tsv'
62+
infile5 = '90AA_metaP.tsv'
63+
outfile = 'GMSC10.90AA.quality_test.tsv'
64+
merge(NUMBER_90,90,infile1,infile2,infile3,infile4,infile5,outfile)

General_Scripts/03_Quality_control/metaproteomics/02_merge.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,29 @@ def processfile_cov(infile):
7474
df = pd.DataFrame(out,columns=['Access','Coverage','QualityString'])
7575
return df.sort_values('Access')
7676

77+
import re
78+
79+
def processfile_cov_all(infile):
80+
with open(infile, 'rt') as db:
81+
out = []
82+
for row in db:
83+
smorf, seq, substr_lst = row.strip().split('\t')
84+
for s in ['[',']',"'",' ']:
85+
substr_lst = substr_lst.replace(s, '')
86+
substr_lst = substr_lst.split(',')
87+
if (smorf != 'query') and (seq != 'Sequence'):
88+
cov, qualstr, _ = covcalc(seq, substr_lst)
89+
out.append([smorf, cov])
90+
df = pd.DataFrame(out,columns=['Access','Coverage'])
91+
return df.sort_values('Access')
92+
7793
if __name__ == '__main__':
7894
folder = "./map_result"
7995
ofile = "merged_output.tsv"
8096
mergeall(folder,ofile)
8197
# properly calculating the coverage per peptide
8298
df = processfile_cov(ofile)
8399
# saving final results
84-
df.to_csv('coverage_analysis.tsv',sep='\t', header=True, index=None)
100+
df.to_csv('coverage_analysis.tsv',sep='\t', header=True, index=None)
101+
df = processfile_cov_all(ofile)
102+
df.to_csv('100AA_metaP.tsv',sep='\t', header=True, index=None)

General_Scripts/03_Quality_control/metaproteomics/03_assign_all_level.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
'''
55

66
def assign(infile1,infile2,outfile1,outfile2):
7-
import lzma
87
import gzip
98

109
out1 = gzip.open(outfile1, "wt", compresslevel=1)
@@ -39,9 +38,54 @@ def assign(infile1,infile2,outfile1,outfile2):
3938
out2.write(f'{linelist[0]}\n')
4039
out2.close()
4140

41+
def full_100(infile1,outfile):
42+
metaP = {}
43+
with open(infile1,'rt') as f:
44+
for line in f:
45+
cluster,number = line.strip().split('\t')
46+
metaP[cluster] = number
47+
48+
with open(outfile,'wt') as out:
49+
for i in range(964970496):
50+
nf = f'{i:09}'
51+
name = f'GMSC10.100AA.{nf[:3]}_{nf[3:6]}_{nf[6:9]}'
52+
if name in metaP.keys():
53+
out.write(f'{name}\t{metaP[name]}\n')
54+
else:
55+
out.write(f'{name}\t0\n')
56+
57+
def full_90(infile1,infile2,outfile):
58+
import gzip
59+
60+
metaP = {}
61+
with open(infile1,'rt') as f:
62+
for line in f:
63+
member,number = line.strip().split('\t')
64+
metaP[member] = float(number)
65+
66+
cluster_dict = {}
67+
with gzip.open(infile2,'rt') as f:
68+
for line in f:
69+
member,cluster = line.strip().split('\t')
70+
if cluster not in cluster_dict.keys():
71+
cluster_dict[cluster] = [metaP[member]]
72+
else:
73+
if metaP[member] != 0:
74+
cluster_dict[cluster].append(metaP[member])
75+
with open(outfile,'wt') as out:
76+
for key,value in sorted(cluster_dict.items()):
77+
p = max(value)
78+
out.write(f'{key}\t{p}\n')
79+
4280
INPUT_FILE_1 = "coverage_analysis.tsv.gz"
4381
INPUT_FILE_2 = "GMSC.cluster.tsv.gz"
4482
OUTPUT_FILE_1 = "90AA_F_T_rate.tsv.gz"
4583
OUTPUT_FILE_2 = "metaP_90AA.tsv.gz"
84+
assign(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1,OUTPUT_FILE_2)
85+
86+
INPUT_FILE_3 = '100AA_metaP.tsv'
87+
OUTPUT_FILE_3 = '100AA_metaP_all.tsv'
88+
full_100(INPUT_FILE_3,OUTPUT_FILE_3)
4689

47-
assign(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1,OUTPUT_FILE_2)
90+
OUTPUT_FILE_4 = '90AA_metaP.tsv'
91+
full_90(OUTPUT_FILE_3,INPUT_FILE_2,OUTPUT_FILE_4)

0 commit comments

Comments
 (0)