ENH change habitat to normalized format

cocodyq · cocodyq · commit 72272e0172ad · 2022-11-06T17:35:25.000+08:00
diff --git a/README.md b/README.md
@@ -241,6 +241,8 @@ The output folder will contain
 
 * `--noquality`: Use this if no need to annotate quality. (default: False)
 
+* `--quiet`: Disable alignment console output. (default: False)
+
 * `--db`: Path to the GMSC database file. (default: ../db/targetdb.dmnd)
 
 * `--habitat`: Path to the habitat file. (default: ../db/ref_habitat.tsv.xz)
@@ -258,5 +260,7 @@ Subcommands: `gmsc-mapper createdb`
 
 * `-m/--mode`: Alignment tool (Diamond / MMseqs2).
 
+* `--quiet`: Disable alignment console output. (default: False)
+
 ## Sensitivity choices considering time and memory usage
 To be done
diff --git a/examples/ref_habitat.npy b/examples/ref_habitat.npy
diff --git a/examples/ref_habitat_index.tsv b/examples/ref_habitat_index.tsv
@@ -0,0 +1,24 @@
+0	air
+1	annelidae associated
+2	anthropogenic
+3	built environment
+4	built environment,human skin
+5	chicken gut
+6	coral associated,marine
+7	human gut
+8	human gut,isolate
+9	human skin
+10	isolate
+11	lake associated
+12	lake associated,river associated
+13	lake associated,water associated
+14	marine
+15	marine,isolate
+16	marine,wastewater,water associated
+17	marine,water associated
+18	plant associated
+19	river associated
+20	soil
+21	termite gut
+22	wastewater
+23	water associated
diff --git a/gmsc_mapper/main.py b/gmsc_mapper/main.py
@@ -115,7 +115,13 @@ def parse_args(args):
                         required=False,
                         help='Path to the habitat file',
                         dest='habitat',
-                        default=path.join(_ROOT, 'db/ref_habitat.tsv.xz'))
+                        default=path.join(_ROOT, 'db/ref_habitat.npy'))
+
+    parser.add_argument('--habitat-index', '--habitat-index',
+                        required=False,
+                        help='Path to the habitat index file',
+                        dest='habitatindex',
+                        default=path.join(_ROOT, 'db/ref_habitat_index.tsv'))
 
     parser.add_argument('--taxonomy', '--taxonomy',
                         required=False,
@@ -406,7 +412,7 @@ def generate_fasta(output,queryfile,resultfile):
 def habitat(args,resultfile):
     from gmsc_mapper.map_habitat import smorf_habitat
     print('Start habitat annotation...')
-    single_number,single_percentage,multi_number,multi_percentage = smorf_habitat(args.output,args.habitat,resultfile)
+    single_number,single_percentage,multi_number,multi_percentage = smorf_habitat(args.habitatindex,args.output,args.habitat,resultfile)
     print('habitat annotation has done.\n')
     return single_number,single_percentage,multi_number,multi_percentage 
 
diff --git a/gmsc_mapper/map_habitat.py b/gmsc_mapper/map_habitat.py
@@ -1,58 +1,54 @@
 import pandas as pd
-
+import numpy as np
 from os import path
 
-
 def fixdf(x):
     x = x.dropna()
     x = x.drop_duplicates()
     return ','.join(x)
     
-    
 def formatlabel(x):
     x = x.split(',')
     x = list(set(x))
     x = sorted(x)
     return ','.join(x)
-    
-        
-def smorf_habitat(outdir, habitatfile, resultfile):
+
+def store_index(indexfile): 
+    index_habitat = pd.read_csv(indexfile,
+                                sep='\t',
+                                header=None,
+                                names=['index','habitat'])
+    index_habitat_dict = index_habitat['habitat'].to_dict()
+    return index_habitat_dict
+
+def smorf_habitat(indexfile, outdir, habitatfile, resultfile):
     habitat_file = path.join(outdir, "habitat.out.smorfs.tsv")	
 
     result = pd.read_csv(resultfile,
                          sep='\t',
-                         header=None)
-                         
+                         header=None)                       
     result.rename({0: 'qseqid', 1: 'sseqid'},
                   axis=1,
                   inplace=True)
-                         
-    reader =  pd.read_table(habitatfile,
-                            sep="\t",
-                            chunksize=5_000_000,
-                            header=None,
-                            names=['sseqid', 'habitat'])
+    result['sseqid'] = result['sseqid'].apply(lambda x: int(x.split('.')[2].replace('_','')))
+    mapped_sseqid = result['sseqid'].to_list()
 
-    output_list = []
-    for chunk in reader:
-        output_chunk = result.merge(on='sseqid',
-                                    right=chunk,
-                                    how='left')
-        output_chunk = output_chunk[['qseqid', 'habitat']]
-        output_list.append(output_chunk)
-        
-    output = pd.concat(output_list,
-                       axis=0)
-    
+    index_habitat_dict = store_index(indexfile)
+
+    habitat = np.load(habitatfile,mmap_mode='r')
+
+    mapped_sseqid_habitat = {}
+    for item in mapped_sseqid:
+        mapped_sseqid_habitat[item] = index_habitat_dict[habitat[item]]
+    result['habitat'] = result['sseqid'].map(lambda g: mapped_sseqid_habitat.get(g))
+
+    output = result[['qseqid', 'habitat']]
     output = output.sort_values(by='qseqid')
-    
     output = output.groupby('qseqid',
                             as_index=False,
-                            sort=False)
-    
+                            sort=False) 
     output = output.agg({'habitat':lambda x : fixdf(x)})
     output['habitat'] = output['habitat'].apply(lambda x: formatlabel(x))
-    
     output.to_csv(habitat_file,
                   sep='\t',
                   index=False)
@@ -71,5 +67,4 @@ def smorf_habitat(outdir, habitatfile, resultfile):
     multi_number = output['habitat'].size - single_number
     multi_percentage = 1 - single_percentage
     
-    return (single_number, single_percentage, multi_number, multi_percentage)
-    
+    return (single_number, single_percentage, multi_number, multi_percentage)
diff --git a/tests.sh b/tests.sh
@@ -8,13 +8,13 @@ gmsc-mapper createdb -i examples/target.faa -o examples/ -m diamond --quiet
 gmsc-mapper createdb -i examples/target.faa -o examples/ -m mmseqs --quiet
 
 echo "Testing basic usage"
-gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.txt --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.txt --quiet
+gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.txt --quiet
 python tests/diamond_contig.py
-gmsc-mapper --aa-genes examples/example.faa -o examples_output/ --db examples/targetdb.dmnd --habitat examples/ref_habitat.txt --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
+gmsc-mapper --aa-genes examples/example.faa -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
 python tests/diamond_protein.py
-gmsc-mapper --nt-genes examples/example.fna -o examples_output/ --db examples/targetdb.dmnd --habitat examples/ref_habitat.txt --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
+gmsc-mapper --nt-genes examples/example.fna -o examples_output/ --db examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --quiet
 python tests/diamond_gene.py
 
 echo "Testing tool flag - MMSeqs"
-gmsc-mapper -i examples/example.fa -o examples_output/ --db examples/targetdb --habitat examples/ref_habitat.txt --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --tool mmseqs --quiet
+gmsc-mapper -i examples/example.fa -o examples_output/ --db examples/targetdb --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality examples/ref_quality.txt --taxonomy examples/ref_taxonomy.txt --tool mmseqs --quiet
 python tests/mmseqs_contig.py
diff --git a/tests/alignment.tsv b/tests/alignment.tsv
@@ -1,5 +1,5 @@
-smORF_0	GMSC10.90AA.000_257_823_465	MVFVLLSEMYPTKVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ*	MSICAVVFVLLSEMYPTRVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ	89	93	98.9	87	1.91e-56	97.8	93.5
-smORF_1	GMSC10.90AA.000_279_368_202	MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLQMLAGQ*	MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLHILAGQ 40	39	94.9	39	2.40e-19	97.5	100
-smORF_2	GMSC10.90AA.000_276_471_764	MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV*	MGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV	97	90	100	90	1.65e-54	92.8	100
-smORF_2	GMSC10.90AA.000_265_853_435	MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV*	MDELTKMGATIQVDGRTAIITGVEGFTGADVEAPDLRAGAALVIAGLAAKGFTTVSEIGYISRGYEDFEKKLRSLGGEIKMVNDEKEIAKFKLKIG	97	96	81.1	95	1.39e-45	97.9	99.0
-smORF_2	GMSC10.90AA.000_287_349_677	MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV*	MDALTKMGATIQVDGRTAIISGVEGFTGADVHAPDLRAGAALVIAGLSAKGFTTVSDIGYIYRGYEQFEQKLKQLGGEIQLVNNEKEVAKFKLKIG	97	96	80.0	95	1.97e-45	97.9	99.0
+smORF_0	GMSC10.90AA.000_000_000_004	MVFVLLSEMYPTKVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ*	MSICAVVFVLLSEMYPTRVRGLAMSIAGFALWIGTYLIGQLTPWMLQNLTPAGTFFLFAVMCVPYMLIVWKLVPETTGKSLEEIERYWTRSEQ	89	93	98.9	87	1.91e-56	97.8	93.5
+smORF_1	GMSC10.90AA.000_000_000_003	MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLQMLAGQ*	MTFSVAGINAQGTTVIEDAECVDVSYPNFYEQLHILAGQ 40	39	94.9	39	2.40e-19	97.5	100
+smORF_2	GMSC10.90AA.000_000_000_002	MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV*	MGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV	97	90	100	90	1.65e-54	92.8	100
+smORF_2	GMSC10.90AA.000_000_000_001	MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV*	MDELTKMGATIQVDGRTAIITGVEGFTGADVEAPDLRAGAALVIAGLAAKGFTTVSEIGYISRGYEDFEKKLRSLGGEIKMVNDEKEIAKFKLKIG	97	96	81.1	95	1.39e-45	97.9	99.0
+smORF_2	GMSC10.90AA.000_000_000_000	MDELTKMGARIQVDGRTAIITGVKLFTGADVSAPDLRAGAALVIAGLAADGYTTVSDIGYIYRGYEGFEKKIQNLGGDIQLVNSEKEIARFKLRIV*	MDALTKMGATIQVDGRTAIISGVEGFTGADVHAPDLRAGAALVIAGLSAKGFTTVSDIGYIYRGYEQFEQKLKQLGGEIQLVNNEKEVAKFKLKIG	97	96	80.0	95	1.97e-45	97.9	99.0
diff --git a/tests/test_habitat.npy b/tests/test_habitat.npy
diff --git a/tests/test_habitat.py b/tests/test_habitat.py
@@ -5,16 +5,17 @@
 import os
 
 known_habitat = {"qseqid":"habitat",
-                 "smORF_0":"human gut",
-                 "smORF_1":"marine,wastewater,water associated",
-                 "smORF_2":"marine,soil,water associated"}
+                 "smORF_0":"soil",
+                 "smORF_1":"marine,water associated",
+                 "smORF_2":"human gut,marine,soil,wastewater,water associated"}
 habitat_dict = {}
 def test_habitat():
-    smorf_habitat(os.path.dirname(os.path.realpath(__file__)),'./tests/test_habitat.txt','./tests/alignment.tsv')
+    smorf_habitat('./tests/test_habitat_index.txt', os.path.dirname(os.path.realpath(__file__)), './tests/test_habitat.npy', './tests/alignment.tsv')
     with open('./tests/habitat.out.smorfs.tsv',"rt") as f:
         for line in f:
             qseqid,habitat = line.strip().split("\t")
             habitat_dict[qseqid] = habitat
+    print(habitat_dict)
     assert habitat_dict == known_habitat
 
 if __name__ == '__main__':
diff --git a/tests/test_habitat.txt b/tests/test_habitat.txt
diff --git a/tests/test_habitat_index.txt b/tests/test_habitat_index.txt
@@ -0,0 +1,4 @@
+0	human gut
+1	marine,wastewater,water associated
+2	marine,water associated
+3	soil
diff --git a/tests/test_quality.txt b/tests/test_quality.txt
@@ -1,2 +1,2 @@
-GMSC10.90AA.000_257_823_465
-GMSC10.90AA.000_287_349_677
+GMSC10.90AA.000_000_000_000
+GMSC10.90AA.000_000_000_004
diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py
@@ -4,11 +4,11 @@
 import pytest
 import os
 
-known_mapped_taxonomy = ["smORF_0\tGMSC10.90AA.000_257_823_465\td__Bacteria",
-                         "smORF_1\tGMSC10.90AA.000_279_368_202\t",
-                         "smORF_2\tGMSC10.90AA.000_276_471_764\td__Bacteria;p__Firmicutes_A",
-                         "smORF_2\tGMSC10.90AA.000_265_853_435\t",
-                         "smORF_2\tGMSC10.90AA.000_287_349_677\td__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465"]
+known_mapped_taxonomy = ["smORF_0\tGMSC10.90AA.000_000_000_004\td__Bacteria",
+                         "smORF_1\tGMSC10.90AA.000_000_000_003\t",
+                         "smORF_2\tGMSC10.90AA.000_000_000_002\td__Bacteria;p__Firmicutes_A",
+                         "smORF_2\tGMSC10.90AA.000_000_000_001\t",
+                         "smORF_2\tGMSC10.90AA.000_000_000_000\td__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465"]
 mapped_taxonomy = []
 def test_smorf_taxonomy():
     taxonomy_file = smorf_taxonomy('./tests/test_taxonomy.txt','./tests/alignment.tsv',os.path.dirname(os.path.realpath(__file__)))
diff --git a/tests/test_taxonomy.txt b/tests/test_taxonomy.txt
@@ -1,5 +1,5 @@
-GMSC10.90AA.000_257_823_465	d__Bacteria
-GMSC10.90AA.000_279_368_202	
-GMSC10.90AA.000_276_471_764	d__Bacteria;p__Firmicutes_A
-GMSC10.90AA.000_265_853_435	
-GMSC10.90AA.000_287_349_677	d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465
+GMSC10.90AA.000_000_000_000	d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__Microbacterium sp003476465
+GMSC10.90AA.000_000_000_001	
+GMSC10.90AA.000_000_000_002	d__Bacteria;p__Firmicutes_A
+GMSC10.90AA.000_000_000_003	
+GMSC10.90AA.000_000_000_004	d__Bacteria