Skip to content

Commit 72272e0

Browse files
committed
ENH change habitat to normalized format
1 parent d9ba509 commit 72272e0

14 files changed

+92
-63
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,8 @@ The output folder will contain
241241

242242
* `--noquality`: Use this if no need to annotate quality. (default: False)
243243

244+
* `--quiet`: Disable alignment console output. (default:False)
245+
244246
* `--db`: Path to the GMSC database file. (default: ../db/targetdb.dmnd)
245247

246248
* `--habitat`: Path to the habitat file. (default: ../db/ref_habitat.tsv.xz)
@@ -258,5 +260,7 @@ Subcommands: `gmsc-mapper createdb`
258260

259261
* `-m/--mode`: Alignment tool (Diamond / MMseqs2).
260262

263+
* `--quiet`: Disable alignment console output. (default:False)
264+
261265
## Sensitivity choices considering time and memory usage
262266
To be done

examples/ref_habitat.npy

484 Bytes
Binary file not shown.

examples/ref_habitat_index.tsv

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
0 air
2+
1 annelidae associated
3+
2 anthropogenic
4+
3 built environment
5+
4 built environment,human skin
6+
5 chicken gut
7+
6 coral associated,marine
8+
7 human gut
9+
8 human gut,isolate
10+
9 human skin
11+
10 isolate
12+
11 lake associated
13+
12 lake associated,river associated
14+
13 lake associated,water associated
15+
14 marine
16+
15 marine,isolate
17+
16 marine,wastewater,water associated
18+
17 marine,water associated
19+
18 plant associated
20+
19 river associated
21+
20 soil
22+
21 termite gut
23+
22 wastewater
24+
23 water associated

gmsc_mapper/main.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,13 @@ def parse_args(args):
115115
required=False,
116116
help='Path to the habitat file',
117117
dest='habitat',
118-
default=path.join(_ROOT, 'db/ref_habitat.tsv.xz'))
118+
default=path.join(_ROOT, 'db/ref_habitat.npy'))
119+
120+
parser.add_argument('--habitat-index', '--habitat-index',
121+
required=False,
122+
help='Path to the habitat index file',
123+
dest='habitatindex',
124+
default=path.join(_ROOT, 'db/ref_habitat_index.tsv'))
119125

120126
parser.add_argument('--taxonomy', '--taxonomy',
121127
required=False,
@@ -406,7 +412,7 @@ def generate_fasta(output,queryfile,resultfile):
406412
def habitat(args,resultfile):
407413
from gmsc_mapper.map_habitat import smorf_habitat
408414
print('Start habitat annotation...')
409-
single_number,single_percentage,multi_number,multi_percentage = smorf_habitat(args.output,args.habitat,resultfile)
415+
single_number,single_percentage,multi_number,multi_percentage = smorf_habitat(args.habitatindex,args.output,args.habitat,resultfile)
410416
print('habitat annotation has done.\n')
411417
return single_number,single_percentage,multi_number,multi_percentage
412418

gmsc_mapper/map_habitat.py

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,54 @@
11
import pandas as pd
2-
2+
import numpy as np
33
from os import path
44

5-
65
def fixdf(x):
76
x = x.dropna()
87
x = x.drop_duplicates()
98
return ','.join(x)
109

11-
1210
def formatlabel(x):
1311
x = x.split(',')
1412
x = list(set(x))
1513
x = sorted(x)
1614
return ','.join(x)
17-
18-
19-
def smorf_habitat(outdir, habitatfile, resultfile):
15+
16+
def store_index(indexfile):
17+
index_habitat = pd.read_csv(indexfile,
18+
sep='\t',
19+
header=None,
20+
names=['index','habitat'])
21+
index_habitat_dict = index_habitat['habitat'].to_dict()
22+
return index_habitat_dict
23+
24+
def smorf_habitat(indexfile, outdir, habitatfile, resultfile):
2025
habitat_file = path.join(outdir, "habitat.out.smorfs.tsv")
2126

2227
result = pd.read_csv(resultfile,
2328
sep='\t',
24-
header=None)
25-
29+
header=None)
2630
result.rename({0: 'qseqid', 1: 'sseqid'},
2731
axis=1,
2832
inplace=True)
29-
30-
reader = pd.read_table(habitatfile,
31-
sep="\t",
32-
chunksize=5_000_000,
33-
header=None,
34-
names=['sseqid', 'habitat'])
33+
result['sseqid'] = result['sseqid'].apply(lambda x: int(x.split('.')[2].replace('_','')))
34+
mapped_sseqid = result['sseqid'].to_list()
3535

36-
output_list = []
37-
for chunk in reader:
38-
output_chunk = result.merge(on='sseqid',
39-
right=chunk,
40-
how='left')
41-
output_chunk = output_chunk[['qseqid', 'habitat']]
42-
output_list.append(output_chunk)
43-
44-
output = pd.concat(output_list,
45-
axis=0)
46-
36+
index_habitat_dict = store_index(indexfile)
37+
38+
habitat = np.load(habitatfile,mmap_mode='r')
39+
40+
mapped_sseqid_habitat = {}
41+
for item in mapped_sseqid:
42+
mapped_sseqid_habitat[item] = index_habitat_dict[habitat[item]]
43+
result['habitat'] = result['sseqid'].map(lambda g: mapped_sseqid_habitat.get(g))
44+
45+
output = result[['qseqid', 'habitat']]
4746
output = output.sort_values(by='qseqid')
48-
4947
output = output.groupby('qseqid',
5048
as_index=False,
51-
sort=False)
52-
49+
sort=False)
5350
output = output.agg({'habitat':lambda x : fixdf(x)})
5451
output['habitat'] = output['habitat'].apply(lambda x: formatlabel(x))
55-
5652
output.to_csv(habitat_file,
5753
sep='\t',
5854
index=False)
@@ -71,5 +67,4 @@ def smorf_habitat(outdir, habitatfile, resultfile):
7167
multi_number = output['habitat'].size - single_number
7268
multi_percentage = 1 - single_percentage
7369

74-
return (single_number, single_percentage, multi_number, multi_percentage)
75-
70+
return (single_number, single_percentage, multi_number, multi_percentage)

tests.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ gmsc-mapper createdb -i examples/target.faa -o examples/ -m diamond --quiet
88
gmsc-mapper createdb -i examples/target.faa -o examples/ -m mmseqs --quiet
99

1010
echo "Testing basic usage"
11-
gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.txt --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.txt --quiet
11+
gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.txt --quiet
1212
python tests/diamond_contig.py
13-
gmsc-mapper --aa-genes examples/example.faa -o examples_output/ --db examples/targetdb.dmnd --habitat examples/ref_habitat.txt --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
13+
gmsc-mapper --aa-genes examples/example.faa -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
1414
python tests/diamond_protein.py
15-
gmsc-mapper --nt-genes examples/example.fna -o examples_output/ --db examples/targetdb.dmnd --habitat examples/ref_habitat.txt --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
15+
gmsc-mapper --nt-genes examples/example.fna -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
1616
python tests/diamond_gene.py
1717

1818
echo "Testing tool flag - MMSeqs"
19-
gmsc-mapper -i examples/example.fa -o examples_output/ --db examples/targetdb --habitat examples/ref_habitat.txt --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --tool mmseqs --quiet
19+
gmsc-mapper -i examples/example.fa -o examples_output/ --db examples/targetdb --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --tool mmseqs --quiet
2020
python tests/mmseqs_contig.py

tests/alignment.tsv

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
smORF_0 GMSC10.90AA.000_257_823_465 MVFVLLSEMYPTKVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ* MSICAVVFVLLSEMYPTRVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ 89 93 98.9 87 1.91e-56 97.8 93.5
2-
smORF_1 GMSC10.90AA.000_279_368_202 MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLQMLAGQ* MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLHILAGQ 40 39 94.9 39 2.40e-19 97.5 100
3-
smORF_2 GMSC10.90AA.000_276_471_764 MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV* MGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV 97 90 100 90 1.65e-54 92.8 100
4-
smORF_2 GMSC10.90AA.000_265_853_435 MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV* MDELTKMGATIQVDGRTAIITGVEGFTGADVEAPDLRAGAALVIAGLAAKGFTTVSEIGYISRGYEDFEKKLRSLGGEIKMVNDEKEIAKFKLKIG 97 96 81.1 95 1.39e-45 97.9 99.0
5-
smORF_2 GMSC10.90AA.000_287_349_677 MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV* MDALTKMGATIQVDGRTAIISGVEGFTGADVHAPDLRAGAALVIAGLSAKGFTTVSDIGYIYRGYEQFEQKLKQLGGEIQLVNNEKEVAKFKLKIG 97 96 80.0 95 1.97e-45 97.9 99.0
1+
smORF_0 GMSC10.90AA.000_000_000_004 MVFVLLSEMYPTKVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ* MSICAVVFVLLSEMYPTRVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ 89 93 98.9 87 1.91e-56 97.8 93.5
2+
smORF_1 GMSC10.90AA.000_000_000_003 MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLQMLAGQ* MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLHILAGQ 40 39 94.9 39 2.40e-19 97.5 100
3+
smORF_2 GMSC10.90AA.000_000_000_002 MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV* MGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV 97 90 100 90 1.65e-54 92.8 100
4+
smORF_2 GMSC10.90AA.000_000_000_001 MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV* MDELTKMGATIQVDGRTAIITGVEGFTGADVEAPDLRAGAALVIAGLAAKGFTTVSEIGYISRGYEDFEKKLRSLGGEIKMVNDEKEIAKFKLKIG 97 96 81.1 95 1.39e-45 97.9 99.0
5+
smORF_2 GMSC10.90AA.000_000_000_000 MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV* MDALTKMGATIQVDGRTAIISGVEGFTGADVHAPDLRAGAALVIAGLSAKGFTTVSDIGYIYRGYEQFEQKLKQLGGEIQLVNNEKEVAKFKLKIG 97 96 80.0 95 1.97e-45 97.9 99.0

tests/test_habitat.npy

148 Bytes
Binary file not shown.

tests/test_habitat.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@
55
import os
66

77
known_habitat = {"qseqid":"habitat",
8-
"smORF_0":"human gut",
9-
"smORF_1":"marine,wastewater,water associated",
10-
"smORF_2":"marine,soil,water associated"}
8+
"smORF_0":"soil",
9+
"smORF_1":"marine,water associated",
10+
"smORF_2":"human gut,marine,soil,wastewater,water associated"}
1111
habitat_dict = {}
1212
def test_habitat():
13-
smorf_habitat(os.path.dirname(os.path.realpath(__file__)),'./tests/test_habitat.txt','./tests/alignment.tsv')
13+
smorf_habitat('./tests/test_habitat_index.txt', os.path.dirname(os.path.realpath(__file__)), './tests/test_habitat.npy', './tests/alignment.tsv')
1414
with open('./tests/habitat.out.smorfs.tsv',"rt") as f:
1515
for line in f:
1616
qseqid,habitat = line.strip().split("\t")
1717
habitat_dict[qseqid] = habitat
18+
print(habitat_dict)
1819
assert habitat_dict == known_habitat
1920

2021
if __name__ == '__main__':

tests/test_habitat.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

0 commit comments

Comments
 (0)