Skip to content

Commit 05c339f

Browse files
committed
ENH change taxonomy to normalized format
1 parent 72272e0 commit 05c339f

File tree

10 files changed

+78
-230
lines changed

10 files changed

+78
-230
lines changed

examples/ref_habitat.txt

Lines changed: 0 additions & 89 deletions
This file was deleted.

examples/ref_taxonomy.npy

484 Bytes
Binary file not shown.

examples/ref_taxonomy.txt

Lines changed: 0 additions & 89 deletions
This file was deleted.

examples/ref_taxonomy_index.tsv

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
0 Unknown
2+
1 d__Archaea;p__Thermoproteota;c__Nitrososphaeria;o__Nitrososphaerales;f__Nitrososphaeraceae;g__Nitrososphaera;s__Nitrososphaera sp002494895
3+
2 d__Archaea;p__Thermoproteota;c__Nitrososphaeria;o__Nitrososphaerales;f__Nitrososphaeraceae;g__UBA10452;s__UBA10452 sp003176995
4+
3 d__Bacteria
5+
4 d__Bacteria;p__Actinobacteriota;c__Actinomycetia
6+
5 d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465
7+
6 d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Streptosporangiales;f__Streptosporangiaceae
8+
7 d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Streptosporangiales;f__Streptosporangiaceae;g__UBA9676;s__UBA9676 sp003541285
9+
8 d__Bacteria;p__Actinobacteriota;c__Thermoleophilia;o__Solirubrobacterales;f__Solirubrobacteraceae;g__Solirubrobacter
10+
9 d__Bacteria;p__Cyanobacteria;c__Cyanobacteriia
11+
10 d__Bacteria;p__Deinococcota;c__Deinococci;o__Deinococcales
12+
11 d__Bacteria;p__Firmicutes_A
13+
12 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Enterocloster;s__Enterocloster sp900551225
14+
13 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales
15+
14 d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__UBA1242;g__UBA6345;s__UBA6345 sp002437945
16+
15 d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__UBA1242;g__UMGS687;s__UMGS687 sp900545735
17+
16 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__SYFN01;s__SYFN01 sp005800405
18+
17 d__Bacteria;p__SAR324;c__SAR324;o__SAR324;f__NAC60-12;g__Arctic96AD-7;s__Arctic96AD-7 sp002685535
19+
18 d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Pedosphaerales;f__Pedosphaeraceae;g__UBA11358

gmsc_mapper/main.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,13 @@ def parse_args(args):
127127
required=False,
128128
help='Path to the taxonomy file',
129129
dest='taxonomy',
130-
default=path.join(_ROOT, 'db/ref_taxonomy.tsv.xz'))
130+
default=path.join(_ROOT, 'db/ref_taxonomy.npy'))
131+
132+
parser.add_argument('--taxonomy-index', '--taxonomy-index',
133+
required=False,
134+
help='Path to the taxonomy index file',
135+
dest='taxonomyindex',
136+
default=path.join(_ROOT, 'db/ref_taxonomy_index.tsv'))
131137

132138
parser.add_argument('--quality', '--quality',
133139
required=False,
@@ -419,7 +425,7 @@ def habitat(args,resultfile):
419425
def taxonomy(args,resultfile,tmpdirname):
420426
from gmsc_mapper.map_taxonomy import deep_lca,taxa_summary
421427
print('Start taxonomy annotation...')
422-
deep_lca(args.taxonomy,args.output,resultfile,tmpdirname)
428+
deep_lca(args.taxonomyindex,args.taxonomy,args.output,resultfile,tmpdirname)
423429
annotated_number,rank_number,rank_percentage = taxa_summary(args.output)
424430
print('taxonomy annotation has done.\n')
425431
return annotated_number,rank_number,rank_percentage

gmsc_mapper/map_taxonomy.py

Lines changed: 33 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,52 @@
11
import numpy as np
22
import pandas as pd
3-
43
from os import path
54

6-
def smorf_taxonomy(taxfile:str, resultfile:str ,tmpdirname: str) -> str:
7-
print('Start taxonomy mapping...')
5+
def store_index(indexfile):
6+
index_tax = pd.read_csv(indexfile,
7+
sep='\t',
8+
header=None,
9+
names=['index','taxonomy'])
10+
index_tax_dict = index_tax['taxonomy'].to_dict()
11+
return index_tax_dict
12+
13+
def smorf_taxonomy(index_tax_dict,taxfile,resultfile,tmpdirname):
14+
print('Start taxonomy mapping...')
15+
tax = np.load(taxfile,mmap_mode='r')
16+
taxonomy_file = path.join(tmpdirname,"taxonomy.out.smorfs.tmp.tsv")
17+
818
result = pd.read_csv(resultfile,
919
sep="\t",
1020
header=None)
11-
12-
result = result.rename({0:'qseqid',
13-
1:'sseqid'},
14-
axis=1)
21+
result.rename({0: 'qseqid', 1: 'sseqid'},
22+
axis=1,
23+
inplace=True)
24+
result['sseqid'] = result['sseqid'].apply(lambda x: int(x.split('.')[2].replace('_','')))
25+
mapped_sseqid = result['sseqid'].to_list()
1526

16-
taxonomy_file = path.join(tmpdirname,
17-
"taxonomy.out.smorfs.tmp.tsv")
18-
19-
reader = pd.read_table(taxfile,
20-
sep="\t",
21-
chunksize=5_000_000,
22-
header=None,
23-
names=['sseqid', 'taxonomy'])
24-
25-
output_list = []
26-
for chunk in reader:
27-
output_chunk = result.merge(on='sseqid',
28-
right=chunk,
29-
how='inner')
30-
output_chunk = output_chunk[['qseqid',
31-
'sseqid',
32-
'taxonomy']]
33-
output_list.append(output_chunk)
34-
35-
output = pd.concat(output_list,
36-
axis=0)
37-
38-
output = output.sort_values(by='qseqid')
27+
mapped_sseqid_tax = {}
28+
for item in mapped_sseqid:
29+
mapped_sseqid_tax[item] = index_tax_dict[tax[item]]
30+
31+
result['taxonomy'] = result['sseqid'].map(lambda g: mapped_sseqid_tax.get(g))
32+
result = result[['qseqid', 'sseqid', 'taxonomy']]
33+
34+
result = result.sort_values(by='qseqid')
3935

40-
output.to_csv(taxonomy_file,
36+
result.to_csv(taxonomy_file,
4137
sep='\t',
4238
index=False,
4339
header=False)
4440

4541
print('Finish taxonomy mapping.')
4642
return taxonomy_file
4743

48-
4944
def fixformat(x):
5045
x = x.split(';')
5146
while len(x) < 7:
5247
x.append('')
5348
return ';'.join(x)
5449

55-
5650
def reducetab(df):
5751
df = df.drop_duplicates()
5852
cols = list(df.columns)
@@ -77,9 +71,11 @@ def reducetab(df):
7771
tax_flag = tax_flag[:-1]
7872
return tax_flag
7973

74+
def deep_lca(indexfile,taxfile, outdirname, resultfile, tmpdirname):
75+
index_tax_dict = store_index(indexfile)
8076

81-
def deep_lca(taxfile, outdirname, resultfile, tmpdirname):
82-
taxonomy_file = smorf_taxonomy(taxfile,
77+
taxonomy_file = smorf_taxonomy(index_tax_dict,
78+
taxfile,
8379
resultfile,
8480
tmpdirname)
8581

@@ -89,8 +85,8 @@ def deep_lca(taxfile, outdirname, resultfile, tmpdirname):
8985
data = pd.read_table(taxonomy_file,
9086
header=None,
9187
names=['smorf', 'gmsc', 'taxonomy'])
92-
93-
data = data.fillna('')
88+
89+
data['taxonomy'].replace('Unknown','',inplace=True)
9490
data.taxonomy = data.taxonomy.apply(lambda x: fixformat(x))
9591
data = data.groupby('smorf').apply(lambda x: x.taxonomy.tolist())
9692
data = data.reset_index()
@@ -113,7 +109,6 @@ def deep_lca(taxfile, outdirname, resultfile, tmpdirname):
113109
header=True,
114110
index=None)
115111

116-
117112
def taxa_summary(outdir):
118113
taxonomy_dlca_file = path.join(outdir,"taxonomy.out.smorfs.tsv")
119114

tests.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ gmsc-mapper createdb -i examples/target.faa -o examples/ -m diamond --quiet
88
gmsc-mapper createdb -i examples/target.faa -o examples/ -m mmseqs --quiet
99

1010
echo "Testing basic usage"
11-
gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.txt --quiet
11+
gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --quiet
1212
python tests/diamond_contig.py
13-
gmsc-mapper --aa-genes examples/example.faa -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
13+
gmsc-mapper --aa-genes examples/example.faa -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --quiet
1414
python tests/diamond_protein.py
15-
gmsc-mapper --nt-genes examples/example.fna -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
15+
gmsc-mapper --nt-genes examples/example.fna -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --quiet
1616
python tests/diamond_gene.py
1717

1818
echo "Testing tool flag - MMSeqs"
19-
gmsc-mapper -i examples/example.fa -o examples_output/ --db examples/targetdb --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --tool mmseqs --quiet
19+
gmsc-mapper -i examples/example.fa -o examples_output/ --db examples/targetdb --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --tool mmseqs --quiet
2020
python tests/mmseqs_contig.py

tests/test_taxonomy.npy

148 Bytes
Binary file not shown.

tests/test_taxonomy.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
import sys
22
sys.path.append("..")
3-
from gmsc_mapper.map_taxonomy import smorf_taxonomy,deep_lca
3+
from gmsc_mapper.map_taxonomy import store_index,smorf_taxonomy,deep_lca
44
import pytest
55
import os
66

7-
known_mapped_taxonomy = ["smORF_0\tGMSC10.90AA.000_000_000_004\td__Bacteria",
8-
"smORF_1\tGMSC10.90AA.000_000_000_003\t",
9-
"smORF_2\tGMSC10.90AA.000_000_000_002\td__Bacteria;p__Firmicutes_A",
10-
"smORF_2\tGMSC10.90AA.000_000_000_001\t",
11-
"smORF_2\tGMSC10.90AA.000_000_000_000\td__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465"]
7+
known_mapped_taxonomy = ["smORF_0\t4\td__Bacteria",
8+
"smORF_1\t3\tUnknown",
9+
"smORF_2\t2\td__Bacteria;p__Firmicutes_A",
10+
"smORF_2\t1\tUnknown",
11+
"smORF_2\t0\td__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465"]
1212
mapped_taxonomy = []
1313
def test_smorf_taxonomy():
14-
taxonomy_file = smorf_taxonomy('./tests/test_taxonomy.txt','./tests/alignment.tsv',os.path.dirname(os.path.realpath(__file__)))
14+
index_tax_dict = store_index('./tests/test_taxonomy_index.tsv')
15+
taxonomy_file = smorf_taxonomy(index_tax_dict,'./tests/test_taxonomy.npy','./tests/alignment.tsv',os.path.dirname(os.path.realpath(__file__)))
1516
with open(taxonomy_file,"rt") as f:
1617
for line in f:
1718
mapped_taxonomy.append(line.replace("\n",""))
@@ -20,7 +21,8 @@ def test_smorf_taxonomy():
2021
known_taxonomy = ["q_seqid\ttaxonomy","smORF_0\td__Bacteria","smORF_1\t","smORF_2\td__Bacteria"]
2122
deep_lca_taxonomy = []
2223
def test_deep_lca():
23-
deep_lca('./tests/test_taxonomy.txt',
24+
deep_lca('./tests/test_taxonomy_index.tsv',
25+
'./tests/test_taxonomy.npy',
2426
os.path.dirname(os.path.realpath(__file__)),
2527
'./tests/alignment.tsv',
2628
os.path.dirname(os.path.realpath(__file__)))

0 commit comments

Comments
 (0)