Skip to content

Commit 54b95e8

Browse files
committed
ENH update 07_GMSC_mapper_benchmark
1 parent 53d2fd8 commit 54b95e8

File tree

5 files changed

+57
-28
lines changed

5 files changed

+57
-28
lines changed

General_Scripts/07_GMSC_mapper_benchmark/01_sensitivity/01_sensitivity.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ for n in {1..7}
77
do
88
for m in 30 40 60 80
99
do
10-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --dbdir ~/mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ~/mapper/benchmark/sensitivity/result/${m}_diamond_${n}
11-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --tool mmseqs --dbdir ~/mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ~/mapper/benchmark/sensitivity/result/${m}_mmseqs_${n}
10+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --dbdir ./mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ./${m}_diamond_${n}
11+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --tool mmseqs --dbdir ./mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ./${m}_mmseqs_${n}
1212
done
13-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select.faa --dbdir ~/mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ~/mapper/benchmark/sensitivity/result/all_diamond_${n}
14-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select.faa --tool mmseqs --dbdir ~/mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ~/mapper/benchmark/sensitivity/result/all_mmseqs_${n}
13+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select.faa --dbdir ./mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ./all_diamond_${n}
14+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select.faa --tool mmseqs --dbdir ./mapper_index -t 20 -s ${n} --cov 0 -e 0.01 -o ./all_mmseqs_${n}
1515
done

General_Scripts/07_GMSC_mapper_benchmark/02_time/01_time.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@
55

66
for n in 1000 10000 100000 1000000
77
do
8-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${n}.faa --dbdir ~/mapper_index -o ~/benchmark/diamond_aa_${n} -t 20 -s 4 --cov 0.9 --id 0.9 -e 0.001
9-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${n}.faa --dbdir ~/mapper_index -o ~/benchmark/mmseqs_aa_${n} -t 20 --cov 0.9 --id 0.9 -e 0.001 --tool mmseqs
8+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${n}.faa --dbdir ./mapper_index -o ./diamond_aa_${n} -t 20 -s 4 --cov 0.9 --id 0.9 -e 0.001
9+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${n}.faa --dbdir ./mapper_index -o ./mmseqs_aa_${n} -t 20 --cov 0.9 --id 0.9 -e 0.001 --tool mmseqs
1010
done

General_Scripts/07_GMSC_mapper_benchmark/03_identity/01_select_mutation.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
'''
2+
Concept:
23
Randomly selected and mutated 10,000 sequences from 90AA smORFs with different length (20,30,40,60,80,all) at different identity.
34
'''
45
from fasta import fasta_iter
56
import random
67

78
def select(infile,outfile,GMSC):
89
selected = frozenset(random.sample(range(GMSC),10000))
9-
with open(outfile,'w') as out:
10+
with open(outfile,'wt') as out:
1011
n=0
1112
for ID,seq in fasta_iter(infile):
1213
n+=1
@@ -23,7 +24,7 @@ def store(infile,l):
2324
def select_length(lengthset,outfile,l):
2425
count = len(lengthset[l])
2526
selected = frozenset(random.sample(range(count),10000))
26-
with open(outfile,'w') as out:
27+
with open(outfile,'wt') as out:
2728
n=0
2829
for sequence in lengthset[l]:
2930
n+=1
@@ -32,20 +33,20 @@ def select_length(lengthset,outfile,l):
3233

3334
def mutation(infile,outfile,number,length):
3435
aminoacid = ["A","R","N","D","C","Q","E","G","H","I","L","K","F","M","P","S","T","W","Y","V","X"]
35-
out = open(outfile,"wt")
36-
for ID,seq in fasta_iter(infile):
37-
selected = frozenset(random.sample(range(length),int(length-length*number)))
38-
for i in selected:
39-
seq = list(seq)
40-
loc = aminoacid.index(seq[i])
41-
if loc != 20:
42-
seq[i] = aminoacid[loc+1]
43-
else:
44-
seq[i] = aminoacid[0]
45-
seq = ''.join(seq)
46-
out.write(f'>{ID}\n{seq}\n')
36+
with open(outfile,'wt') as out:
37+
for ID,seq in fasta_iter(infile):
38+
selected = frozenset(random.sample(range(length),int(length-length*number)))
39+
for i in selected:
40+
seq = list(seq)
41+
loc = aminoacid.index(seq[i])
42+
if loc != 20:
43+
seq[i] = aminoacid[loc+1]
44+
else:
45+
seq[i] = aminoacid[0]
46+
seq = ''.join(seq)
47+
out.write(f'>{ID}\n{seq}\n')
4748

48-
INPUT_FILE_1 = "90AA_GMSC_sort.faa"
49+
INPUT_FILE_1 = "90AA_GMSC.faa"
4950
OUT_FILE_1 = "GMSC_90_select.faa"
5051
GMSC_90 = 287926875
5152
identity = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
@@ -56,7 +57,7 @@ def mutation(infile,outfile,number,length):
5657
for l in length:
5758
OUT_FILE_2 = f'GMSC_90_select_{str(l)}.faa'
5859
lengthset = store(INPUT_FILE_1,l)
59-
select(lengthset,OUT_FILE_2,l)
60+
select_length(lengthset,OUT_FILE_2,l)
6061

6162
for i in identity:
6263
for l in length:

General_Scripts/07_GMSC_mapper_benchmark/03_identity/02_identity.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ for m in 20 30 40 60 80
77
do
88
for n in 0.9 0.8 0.7 0.6 0.5 0.4 0.3 0.2 0.1
99
do
10-
/usr/bin/time -v blastp -query GMSC_90_select_${m}_${n}.faa -out ~/benchmark/identity/result/blast/${m}_blast_${n}.tsv -db ~/blastdb/90AA_GMSC_new -evalue 0.01 -outfmt "6 qseqid sseqid qlen slen pident length evalue qcovs" -num_threads 20
11-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}_${n}.faa --dbdir ~/mapper_index -t 20 -s 4 --cov 0 -e 0.01 -o ~/benchmark/identity/result/${m}_diamond_${n}
12-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}_${n}.faa --dbdir ~/mapper_index -t 20 --cov 0 -e 0.01 -o ~/benchmark/identity/result/${m}_mmseqs_${n} --tool mmseqs
10+
/usr/bin/time -v blastp -query GMSC_90_select_${m}_${n}.faa -out ./${m}_blast_${n}.tsv -db ~/blastdb/90AA_GMSC_new -evalue 0.01 -outfmt "6 qseqid sseqid qlen slen pident length evalue qcovs" -num_threads 20
11+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}_${n}.faa --dbdir ./mapper_index -t 20 -s 4 --cov 0 -e 0.01 -o ./${m}_diamond_${n}
12+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}_${n}.faa --dbdir ./mapper_index -t 20 --cov 0 -e 0.01 -o ./${m}_mmseqs_${n} --tool mmseqs
1313
done
14-
/usr/bin/time -v blastp -query GMSC_90_select_${m}.faa -out ~/benchmark/identity/result/blast/${m}_blast_1.tsv -db ~/blastdb/90AA_GMSC_new -evalue 0.01 -outfmt "6 qseqid sseqid qlen slen pident length evalue qcovs" -num_threads 20
15-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --dbdir ~/mapper_index -t 20 -s 4 --cov 0 -e 0.01 -o ~/benchmark/identity/result/${m}_diamond_1
16-
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --dbdir ~/mapper_index -t 20 --cov 0 -e 0.01 -o ~/benchmark/identity/result/${m}_mmseqs_1 --tool mmseqs
14+
/usr/bin/time -v blastp -query GMSC_90_select_${m}.faa -out ./${m}_blast_1.tsv -db ./90AA_GMSC_new -evalue 0.01 -outfmt "6 qseqid sseqid qlen slen pident length evalue qcovs" -num_threads 20
15+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --dbdir ./mapper_index -t 20 -s 4 --cov 0 -e 0.01 -o ./${m}_diamond_1
16+
/usr/bin/time -v gmsc-mapper --aa-genes GMSC_90_select_${m}.faa --dbdir ./mapper_index -t 20 --cov 0 -e 0.01 -o ./${m}_mmseqs_1 --tool mmseqs
1717
done
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
def fasta_iter(fname, full_header=False):
2+
header = None
3+
chunks = []
4+
if fname.endswith('.gz'):
5+
import gzip
6+
op = gzip.open
7+
elif fname.endswith('.xz'):
8+
import lzma
9+
op = lzma.open
10+
else:
11+
op = open
12+
with op(fname, 'rt') as f:
13+
for line in f:
14+
if line[0] == '>':
15+
if header is not None:
16+
yield header,''.join(chunks)
17+
line = line[1:].strip()
18+
if not line:
19+
header = ''
20+
elif full_header:
21+
header = line.strip()
22+
else:
23+
header = line.split()[0]
24+
chunks = []
25+
else:
26+
chunks.append(line.strip())
27+
if header is not None:
28+
yield header, ''.join(chunks)

0 commit comments

Comments
 (0)