Skip to content

Commit e42f4b6

Browse files
committed
ENH update 03_Quality_control
1 parent e2a06e8 commit e42f4b6

23 files changed

+323
-505
lines changed

General_Scripts/03_Quality_control/Antifam/01_run_antifam.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
set -e
77
set -o pipefail
88

9-
hmmpress ~/software/antifam/AntiFam.hmm
9+
hmmpress AntiFam.hmm
1010

11-
hmmsearch --cut_ga --tblout ~/antifam/antifam_result_100.tsv ~/software/antifam/AntiFam.hmm ./data/frozen/100AA_GMSC_sort.faa
11+
hmmsearch --cut_ga --tblout antifam_result_100.tsv AntiFam.hmm 100AA_GMSC.faa
1212

1313
awk 'NR>3&&NR<4031131&&$5 <= 0.00001' antifam_result_100.tsv > antifam_result_100.tsv.tmp
1414

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
'''
2+
Concept:
23
Generate antifam results to 90AA.
34
If there is at least 1 smORF mapped to antifam in 90AA cluster,the cluster will be spurious.
45
'''
56

67
def assign(infile1,infile2,outfile1,outfile2):
7-
import lzma
88
import gzip
99

1010
out1 = gzip.open(outfile1, "wt", compresslevel=1)
@@ -18,38 +18,30 @@ def assign(infile1,infile2,outfile1,outfile2):
1818
line = line.strip()
1919
smorf.add(line)
2020

21-
with lzma.open(infile2,"rt") as f2:
21+
with gzip.open(infile2,"rt") as f2:
2222
for line in f2:
23-
line = line.strip()
24-
linelist = line.split("\t")
25-
if linelist[1] != "":
26-
if linelist[1] in cluster_90.keys():
27-
if linelist[0] in smorf:
28-
cluster_90[linelist[1]][0] += 1
29-
else:
30-
cluster_90[linelist[1]][1] += 1
31-
else:
32-
cluster_90[linelist[1]] = [0,0]
33-
if linelist[0] in smorf:
34-
cluster_90[linelist[1]][0] += 1
35-
else:
36-
cluster_90[linelist[1]][1] += 1
23+
member,cluster = line.strip().split("\t")
24+
if cluster not in cluster_90.keys():
25+
cluster_90[cluster] = [0,0]
26+
if member in smorf:
27+
cluster_90[cluster][0] += 1
28+
else:
29+
cluster_90[cluster][1] += 1
3730

3831
for key,value in cluster_90.items():
39-
out1.write(key+"\t"+str(value[0])+"\t"+str(value[1])+"\t"+str(value[0]/(value[0]+value[1]))+"\n")
32+
out1.write(f'{key}\t{value[0]}\t{value[1]}\t{value[0]/(value[0]+value[1])}\n')
4033
out1.close()
4134

4235
with gzip.open (outfile1,"rt") as f3:
4336
for line in f3:
44-
line = line.strip()
45-
linelist = line.split("\t")
37+
linelist = line.strip().split("\t")
4638
if float(linelist[3]) > 0:
47-
out2.write(linelist[0]+"\n")
39+
out2.write(f'{linelist[0]}\n')
4840
out2.close()
4941

50-
INPUT_FILE_1 = "./antifam/antifam_result.tsv"
51-
INPUT_FILE_2 = "./data/frozen/all_0.9_0.5_family.tsv.xz"
52-
OUTPUT_FILE_1 = "./antifam/90AA_F_T_rate.tsv.gz"
53-
OUTPUT_FILE_2 = "./antifam/antifam_90AA.tsv.gz"
42+
INPUT_FILE_1 = "antifam_result.tsv"
43+
INPUT_FILE_2 = "GMSC.cluster.tsv.gz"
44+
OUTPUT_FILE_1 = "90AA_F_T_rate.tsv.gz"
45+
OUTPUT_FILE_2 = "antifam_90AA.tsv.gz"
5446

5547
assign(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1,OUTPUT_FILE_2)

General_Scripts/03_Quality_control/Coordinates/01_allcoordinate.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ def detect_sequence(sitelist,seq,out):
6161
tf = 1
6262
break
6363
if tf == 0:
64-
out.write(GMSC+"\t"+"F"+"\n")
64+
out.write(f'{GMSC}\tF\n')
6565
else:
66-
out.write(GMSC+"\t"+"T"+"\n")
66+
out.write(f'{GMSC}\tT\n')
6767
tf = 0
6868
else:
6969
seq = complement(seq)
@@ -73,9 +73,9 @@ def detect_sequence(sitelist,seq,out):
7373
tf = 1
7474
break
7575
if tf == 0:
76-
out.write(GMSC+"\t"+"F"+"\n")
76+
out.write(f'{GMSC}\tF\n')
7777
else:
78-
out.write(GMSC+"\t"+"T"+"\n")
78+
out.write(f'{GMSC}\tT\n')
7979
tf = 0
8080

8181
def detect_contigdict(contigdict,seqdict,out):
@@ -103,23 +103,23 @@ def detect_contigdict(contigdict,seqdict,out):
103103
for i in range(0,len(value)):
104104
if value[i][3] == 1:
105105
if len(flagr) == 3:
106-
out.write(value[i][0]+"\t"+"T"+"\n")
106+
out.write(f'{value[i][0]}\tT\n')
107107
else:
108108
if value[i][1] % 3 not in flagp:
109109
flagp.add(value[i][1] % 3)
110110
detect_sequence(value[i],seq,out)
111111
else:
112-
out.write(value[i][0]+"\t"+"T"+"\n")
112+
out.write(f'{value[i][0]}\tT\n')
113113
for i in range(len(value)-1,-1,-1):
114114
if value[i][3] == -1:
115115
if len(flagr) == 3:
116-
out.write(value[i][0]+"\t"+"T"+"\n")
116+
out.write(f'{value[i][0]}\tT\n')
117117
else:
118118
if value[i][2] % 3 not in flagr:
119119
flagr.add(value[i][2] % 3)
120120
detect_sequence(value[i],seq,out)
121121
else:
122-
out.write(value[i][0]+"\t"+"T"+"\n")
122+
out.write(f'{value[i][0]}\tT\n')
123123

124124
def coordinate(infile,fasta_path,outfile):
125125
'''

General_Scripts/03_Quality_control/Coordinates/02_assign_all_level.py

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@
33
If the true rate in the family > 0.5,then the family will be true.
44
'''
55

6-
'''
7-
Assign 100AA
8-
'''
96
def assign_100(infile1,infile2,outfile):
107
import gzip
118
import lzma
@@ -15,19 +12,16 @@ def assign_100(infile1,infile2,outfile):
1512

1613
with gzip.open (infile1,"rt") as f1:
1714
for line in f1:
18-
line = line.strip()
19-
metag,tf = line.split("\t")
15+
metag,tf = line.strip().split("\t")
2016
tfdict[metag] = tf
2117

2218
with lzma.open(infile2,"rt") as f2:
2319
for line in f2:
24-
line = line.strip()
25-
linelist = line.split("\t")
26-
if linelist[0] in tfdict.keys():
27-
out.write(linelist[1]+"\t"+tfdict[linelist[0]]+"\n")
20+
member,cluster = line.strip().split("\t")
21+
if member in tfdict.keys():
22+
out.write(f'{member}\t{tfdict[member]}\n')
2823
else:
29-
out.write(linelist[1]+"\t"+linelist[0]+"\n")
30-
24+
out.write(f'{member}\tNA\n')
3125
out.close()
3226

3327
def assign_90(infile1,infile2,outfile1,outfile2):
@@ -42,51 +36,39 @@ def assign_90(infile1,infile2,outfile1,outfile2):
4236

4337
with gzip.open (infile1,"rt") as f1:
4438
for line in f1:
45-
line = line.strip()
46-
linelist = line.split("\t")
47-
tf[linelist[0]] = linelist[1]
39+
smorf,terminal = line.strip().split("\t")
40+
tf[smorf] = terminal
4841

4942
with lzma.open(infile2,"rt") as f2:
5043
for line in f2:
51-
line = line.strip()
52-
linelist = line.split("\t")
53-
if linelist[1] != "":
54-
if linelist[1] in cluster_90.keys():
55-
if tf[linelist[0]] == "T":
56-
cluster_90[linelist[1]][0] += 1
57-
elif tf[linelist[0]] == "F":
58-
cluster_90[linelist[1]][1] += 1
59-
else:
60-
cluster_90[linelist[1]][2] += 1
61-
else:
62-
cluster_90[linelist[1]] = [0,0,0]
63-
if tf[linelist[0]] == "T":
64-
cluster_90[linelist[1]][0] += 1
65-
elif tf[linelist[0]] == "F":
66-
cluster_90[linelist[1]][1] += 1
67-
else:
68-
cluster_90[linelist[1]][2] += 1
44+
member,cluster = line.strip().split("\t")
45+
if cluster not in cluster_90.keys():
46+
cluster_90[cluster] = [0,0,0]
47+
if tf[member] == "T":
48+
cluster_90[cluster][0] += 1
49+
elif tf[member] == "F":
50+
cluster_90[cluster][1] += 1
51+
else:
52+
cluster_90[cluster][2] += 1
6953

7054
for key,value in cluster_90.items():
71-
out1.write(key+"\t"+str(value[0])+"\t"+str(value[1])+"\t"+str(value[2])+"\t"+str(value[0]/(value[0]+value[1]+value[2]))+"\n")
55+
out1.write(f'{key}\t{value[0]}\t{value[1]}\t{value[2]}\t{value[0]/(value[0]+value[1]+value[2])}\n')
7256
out1.close()
7357

7458
with gzip.open (outfile1,"rt") as f3:
7559
for line in f3:
76-
line = line.strip()
77-
linelist = line.split("\t",)
60+
linelist = line.strip().split("\t")
7861
if float(linelist[4]) > 0.5:
79-
out2.write(linelist[0]+"\t"+"T"+"\n")
62+
out2.write(f'{linelist[0]}\tT\n')
8063
else:
81-
out2.write(linelist[0]+"\t"+"F"+"\n")
64+
out2.write(f'{linelist[0]}\tF\n')
8265
out2.close()
8366

84-
INPUT_FILE_1 = "./coordinate/all/result.tsv.gz"
85-
INPUT_FILE_2 = "./data/frozen/100AA_rename.tsv.xz"
86-
INPUT_FILE_3 = "./data/frozen/all_0.9_0.5_family.tsv.xz"
87-
OUTPUT_FILE_1 = "./coordinate/all/100AA_coordinate.tsv.gz"
88-
OUTPUT_FILE_2 = "./coordinate/all/90AA_T_F_iso_rate.tsv.gz"
89-
OUTPUT_FILE_3 = "./coordinate/all/90AA_coordinate.tsv.gz"
67+
INPUT_FILE_1 = "result.tsv.gz"
68+
INPUT_FILE_2 = "GMSC.cluster.tsv.gz"
69+
OUTPUT_FILE_1 = "100AA_coordinate.tsv.gz"
70+
OUTPUT_FILE_2 = "90AA_T_F_iso_rate.tsv.gz"
71+
OUTPUT_FILE_3 = "90AA_coordinate.tsv.gz"
9072

9173
assign_100(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
92-
assign_90(OUTPUT_FILE_1,INPUT_FILE_3,OUTPUT_FILE_2,OUTPUT_FILE_3)
74+
assign_90(OUTPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_2,OUTPUT_FILE_3)

0 commit comments

Comments
 (0)