ENH add cdd annotation

cocodyq · cocodyq · commit 91d8fdf48303 · 2023-07-05T15:55:27.000+08:00
diff --git a/README.md b/README.md
@@ -49,42 +49,44 @@ Because the whole GMSC database is large, and takes some minutes to process.
 
 If you want to check if the installation works well, you can test with mock datasets easily and fast.
 
+Please make `GMSC-mapper` your working directory.
+
 - Create GMSC database index
 
 Default alignment tool is Diamond.
 
 ```bash
-gmsc-mapper createdb -i examples/target.faa -o examples/ -m diamond
+gmsc-mapper createdb -i ./examples/target.faa -o ./examples/ -m diamond
 ```
 
 If you want to use MMseqs2 as your alignment tool, you need to create GMSC database index in MMseqs2 format.
 
 ```bash
-gmsc-mapper createdb -i examples/target.faa -o examples/ -m mmseqs
+gmsc-mapper createdb -i ./examples/target.faa -o ./examples/ -m mmseqs
 ```
 
 - Input is genome contig sequences.
 
 ```bash
-gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv
+gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --domain ./examples/ref_domain.txt
 ```
 
 - Input is amino acid sequences.
 
 ```bash
-gmsc-mapper --aa-genes ./examples/example.faa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv
+gmsc-mapper --aa-genes ./examples/example.faa -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --domain ./examples/ref_domain.txt
 ```
 
 - Input is nucleotide gene sequences.
 
 ```bash
-gmsc-mapper --nt-genes ./examples/example.fna -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv
+gmsc-mapper --nt-genes ./examples/example.fna -o ./examples_output/ --db ./examples/targetdb.dmnd --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --domain ./examples/ref_domain.txt
 ```
 
 - Check another alignment tool: MMseqs2
 
 ```bash
-gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --tool mmseqs
+gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targetdb --habitat ./examples/ref_habitat.npy --habitat-index ./examples/ref_habitat_index.tsv --quality ./examples/ref_quality.txt --taxonomy ./examples/ref_taxonomy.npy --taxonomy-index ./examples/ref_taxonomy_index.tsv --domain ./examples/ref_domain.txt --tool mmseqs
 ```
 
 ## Usage
@@ -106,7 +108,7 @@ gmsc-mapper createdb -i ../db/90AA_GMSC.faa.gz -m mmseqs
 ```
 
 ### Default
-GMSC database / habitat / taxonomy / quality file path and output directory path can be assigned on your own.Default is `GMSC-mapper/db` and `GMSC-mapper/output`.
+GMSC database / habitat / taxonomy / quality / domain file paths and the output directory path can be assigned on your own. Defaults are `GMSC-mapper/db` and `GMSC-mapper/output`.
 
 1. Input is genome contig sequences.
 
@@ -133,11 +135,11 @@ If you want to change alignment tool (Diamond / MMseqs2), you can use `--tool`.
 gmsc-mapper -i ../examples/example.fa --tool mmseqs
 ```
 
-### Habitat / taxonomy / quality annotation is optional
-If you don't want to annotate habitat / taxonomy / quality you can use `--nohabitat`/`--notaxonomy`/`--noquality`.
+### Habitat / taxonomy / quality / domain annotation is optional
+If you don't want to annotate habitat / taxonomy / quality / domain, you can use `--no-habitat`/`--no-taxonomy`/`--no-quality`/`--no-domain`.
 
 ```bash
-gmsc-mapper -i ../examples/example.fa --nohabitat --notaxonomy --noquality
+gmsc-mapper -i ../examples/example.fa --no-habitat --no-taxonomy --no-quality --no-domain
 ```
 
 ## Output files
@@ -191,17 +193,17 @@ The output folder will contain
 
 - Habitat annotation of smORFs (optional) (habitat.out.smorfs.tsv) 
 
-  A file listing the habitat annotation for each smORF homologous to GMSC.
+  This file lists the habitat annotations of the query/predicted sequence, where the habitat is obtained from the sequence annotations of its homologous origin in GMSC.
 
   There are two columns in the file:
 
   `qseqid`: Query seq id
 
-  `habitat`: Habitat, ',' separated if the sequences is from multiple habitats
+  `habitat`: Habitat, ',' separated if the sequence is from multiple habitats
 
 - Taxonomy annotation of smORFs (optional) (taxonomy.out.smorfs.tsv)
 
-  A file listing the taxonomy annotation for each smORF homologous to GMSC.
+  This file lists the taxonomy annotations of the query/predicted sequence, where the taxonomy is obtained from the sequence annotations of its homologous origin in GMSC.
 
   There are two columns in the file:
 
@@ -211,13 +213,21 @@ The output folder will contain
 
 - Quality annotation of smORFs (optional) (quality.out.smorfs.tsv)
 
-  A file listing the quality annotation for each smORF homologous to GMSC.
+  This file lists the quality annotations of the query/predicted sequence, where the quality is obtained from the sequence annotations of its homologous origin in GMSC.
 
   `qseqid`: Query seq id
 
   `quality`: Quality label
 
-- Summry (summary.txt)
+- Conserved domain annotation of smORFs (optional) (domain.out.smorfs.tsv)
+
+  This file lists the conserved domain annotations of the query/predicted sequence, where the conserved domain is obtained from the sequence annotations of its homologous origin in GMSC.
+
+  `qseqid`: Query seq id
+
+  `cdd`: Identifiers from the Conserved Domain Database (CDD), ',' separated if the sequence is annotated with multiple conserved domains.
+
+- Summary (summary.txt)
 
   A file providing a human-readable summary of the results.
 
@@ -244,11 +254,13 @@ The output folder will contain
 
 * `--filter`: Use this to filter <100 aa or <303 nt input sequences. (default: False)
 
-* `--nohabitat`: Use this if no need to annotate habitat. (default: False)
+* `--no-habitat`: Use this if no need to annotate habitat. (default: False)
+
+* `--no-taxonomy`: Use this if no need to annotate taxonomy. (default: False)
 
-* `--notaxonomy`: Use this if no need to annotate taxonomy. (default: False)
+* `--no-quality`: Use this if no need to annotate quality. (default: False)
 
-* `--noquality`: Use this if no need to annotate quality. (default: False)
+* `--no-domain`: Use this if no need to annotate conserved domain. (default: False)
 
 * `--quiet`: Disable alignment console output. (default:False)
 
diff --git a/examples/output/domain.out.smorfs.tsv b/examples/output/domain.out.smorfs.tsv
@@ -0,0 +1,29 @@
+qseqid	cdd
+smORF_00	198061,429887
+smORF_01	
+smORF_02	197696
+smORF_05	
+smORF_06	381607
+smORF_07	
+smORF_08	
+smORF_09	425695
+smORF_10	
+smORF_11	
+smORF_12	
+smORF_13	
+smORF_14	433792
+smORF_15	
+smORF_16	
+smORF_17	
+smORF_18	
+smORF_19	426994
+smORF_20	429147
+smORF_21	
+smORF_22	
+smORF_24	
+smORF_25	
+smORF_26	425742
+smORF_27	429147
+smORF_28	
+smORF_29	197911,429147
+smORF_30	
diff --git a/examples/output/summary.txt b/examples/output/summary.txt
@@ -3,18 +3,22 @@
 28 smORFs are aligned against GMSC in total.
 
 # Quality
-8(28.57%) aligned smORFs are high quality.
+8 (28.57%) aligned smORFs are high quality.
 
 # Habitat
-25(89.29%) aligned smORFs are single-habitat.
-3(10.71%) aligned smORFs are multi-habitat.
+25 (89.29%) aligned smORFs are single-habitat.
+3 (10.71%) aligned smORFs are multi-habitat.
 
 # Taxonomy
-8(28.57%) aligned smORFs have taxonomy annotation.
-3(10.71%) aligned smORFs are annotated on kingdom.
-0(0%) aligned smORFs are annotated on phylum.
-1(3.57%) aligned smORFs are annotated on class.
-1(3.57%) aligned smORFs are annotated on order.
-0(0%) aligned smORFs are annotated on family.
-0(0%) aligned smORFs are annotated on genus.
-3(10.71%) aligned smORFs are annotated on species.
+8 (28.57%) aligned smORFs have taxonomy annotation.
+3 (10.71%) aligned smORFs are annotated at level of kingdom.
+0 (0.00%) aligned smORFs are annotated at level of phylum.
+1 (3.57%) aligned smORFs are annotated at level of class.
+1 (3.57%) aligned smORFs are annotated at level of order.
+0 (0.00%) aligned smORFs are annotated at level of family.
+0 (0.00%) aligned smORFs are annotated at level of genus.
+3 (10.71%) aligned smORFs are annotated at level of species.
+
+# Conserved domain
+10 aligned smORFs are annotated with CDD database.
+
diff --git a/examples/ref_domain.txt b/examples/ref_domain.txt
@@ -0,0 +1,10 @@
+GMSC10.90AA.000_000_000_080	198061,429887
+GMSC10.90AA.000_000_000_082	197696
+GMSC10.90AA.000_000_000_088	381607
+GMSC10.90AA.000_000_000_016	425695
+GMSC10.90AA.000_000_000_032	433792
+GMSC10.90AA.000_000_000_044	426994
+GMSC10.90AA.000_000_000_065	425742
+GMSC10.90AA.000_000_000_078	429147,197911
+GMSC10.90AA.000_000_000_068	429147
+GMSC10.90AA.000_000_000_047	429147
diff --git a/gmsc_mapper/main.py b/gmsc_mapper/main.py
@@ -103,12 +103,14 @@ def parse_args(args):
 
     parser.add_argument('--filter','--filter',action='store_true', help='Use this to filter <100 aa or <303 nt input sequences.')
 
-    parser.add_argument('--nohabitat','--nohabitat',action='store_true', help='Use this if no need to annotate habitat')
+    parser.add_argument('--no-habitat','--no-habitat',action='store_true', dest='nohabitat', help='Use this if no need to annotate habitat')
 
-    parser.add_argument('--notaxonomy', '--notaxonomy',action='store_true', help='Use this if no need to annotate taxonomy')
+    parser.add_argument('--no-taxonomy', '--no-taxonomy',action='store_true', dest='notaxonomy', help='Use this if no need to annotate taxonomy')
 
-    parser.add_argument('--noquality', '--noquality',action='store_true', help='Use this if no need to annotate quality')
+    parser.add_argument('--no-quality', '--no-quality',action='store_true', dest='noquality', help='Use this if no need to annotate quality')
     
+    parser.add_argument('--no-domain', '--no-domain',action='store_true', dest='nodomain', help='Use this if no need to annotate conserved domain')
+
     parser.add_argument('--quiet','--quiet',action='store_true', help='Disable alignment console output')
 
     parser.add_argument('--db', '--db',
@@ -145,6 +147,12 @@ def parse_args(args):
                         help='Path to the quality file',
                         dest='quality',
                         default=path.join(_ROOT, 'db/ref_quality.tsv.xz'))
+    
+    parser.add_argument('--domain', '--domain',
+                        required=False,
+                        help='Path to the conserved domain file',
+                        dest='domain',
+                        default=path.join(_ROOT, 'db/ref_domain.tsv.xz'))
 
     return parser.parse_args(args[1:])
 
@@ -213,6 +221,9 @@ def expect_file(f):
     if not args.noquality and args.quality:
         expect_file(args.quality)
 
+    if not args.nodomain and args.domain:
+        expect_file(args.domain)
+
 def create_db(args):
     if not os.path.exists(args.output):
         os.makedirs(args.output)
@@ -395,6 +406,13 @@ def quality(args,resultfile):
     logger.info('Quality annotation completed.')
     return number,percentage
 
+def domain(args,resultfile):
+    from gmsc_mapper.map_domain import smorf_domain
+    logger.debug('Start domain annotation...')
+    number = smorf_domain(args.domain,args.output,resultfile)
+    logger.info('Domain annotation completed.')
+    return number
+
 def predicted_smorf_count(file_name):
     return sum(1 for _ in open(file_name, 'rt'))
 
@@ -485,6 +503,11 @@ def main(args=None):
                             summary.append(f'{annotated_number} ({1 - rank_percentage["no rank"]:.2%}) aligned smORFs have taxonomy annotation.')
                             for rank in ['kingdom','phylum','class','order','family','genus','species']:
                                 summary.append(f'{rank_number[rank]} ({rank_percentage[rank]:.2%}) aligned smORFs are annotated at level of {rank}.')
+
+                        if not args.nodomain:
+                            summary.append(f'\n# Conserved domain')
+                            number = domain(args,resultfile)
+                            summary.append(f'{number} aligned smORFs are annotated with CDD database.\n')
                     else:
                         summary.append(f'None of sequences are aligned against GMSC.\n')
                 
diff --git a/gmsc_mapper/map_domain.py b/gmsc_mapper/map_domain.py
@@ -0,0 +1,51 @@
+import pandas as pd
+from os import path
+
+def fixdf(x):
+    x = x.dropna()
+    x = x.drop_duplicates()
+    return ','.join(x)
+
+def formatlabel(x):
+    x = x.split(',')
+    x = list(set(x))
+    x = sorted(x)
+    return ','.join(x)
+
+def store_domain(cddfile): 
+    cdd = pd.read_table(cddfile, sep='\t', header=None, names=['gmsc','cdd'])
+    cdd_dict = dict(zip(cdd['gmsc'],cdd['cdd']))
+    return cdd_dict
+
+def smorf_domain(cddfile, outdir, resultfile):
+    cdd_file = path.join(outdir, "domain.out.smorfs.tsv")
+
+    result = pd.read_csv(resultfile,
+                         sep='\t',
+                         header=None)                       
+    result.rename({0: 'qseqid', 1: 'sseqid'},
+                  axis=1,
+                  inplace=True)
+    mapped_sseqid = result['sseqid'].to_list()
+
+    cdd_dict = store_domain(cddfile)
+
+    mapped_sseqid_cdd = {}
+    for item in mapped_sseqid:
+        if item in cdd_dict.keys():
+            mapped_sseqid_cdd[item] = cdd_dict[item]
+    result['cdd'] = result['sseqid'].map(lambda g: mapped_sseqid_cdd.get(g))
+
+    output = result[['qseqid', 'cdd']]
+    output = output.sort_values(by='qseqid')
+    output = output.groupby('qseqid',
+                            as_index=False,
+                            sort=False) 
+    output = output.agg({'cdd':lambda x : fixdf(x)})
+    output['cdd'] = output['cdd'].apply(lambda x: formatlabel(x))
+    output.to_csv(cdd_file,
+                  sep='\t',
+                  index=False)    
+    
+    annotated = output[output['cdd']!='']['cdd'].count()
+    return annotated
diff --git a/tests/test_domain.py b/tests/test_domain.py
@@ -0,0 +1,19 @@
+from gmsc_mapper.map_domain import smorf_domain
+import pytest
+import os
+
+known_domain = {"qseqid":"cdd",
+                 "smORF_0":"197696",
+                 "smORF_1":"",
+                 "smORF_2":"198061,429147,429887"}
+domain_dict={}
+def test_smorf_domain():
+    smorf_domain('./tests/test_domain.txt',os.path.dirname(os.path.realpath(__file__)),'./tests/alignment.tsv')
+    with open('./tests/domain.out.smorfs.tsv',"rt") as f:
+        for line in f:
+            qseqid,domain = line.split("\t")
+            domain_dict[qseqid] = domain.strip()
+    assert domain_dict == known_domain
+
+if __name__ == '__main__':
+    pytest.main()
diff --git a/tests/test_domain.txt b/tests/test_domain.txt
@@ -0,0 +1,3 @@
+GMSC10.90AA.000_000_000_000	198061,429887
+GMSC10.90AA.000_000_000_001	429147,429887
+GMSC10.90AA.000_000_000_004	197696

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+GMSC10.90AA.000_000_000_000 198061,429887`
	`2`	`+GMSC10.90AA.000_000_000_001 429147,429887`
	`3`	`+GMSC10.90AA.000_000_000_004 197696`