Skip to content

Commit 65bafb5

Browse files
committed
ENH add download db
1 parent 91d8fdf commit 65bafb5

File tree

3 files changed

+124
-21
lines changed

3 files changed

+124
-21
lines changed

README.md

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -90,21 +90,30 @@ gmsc-mapper -i ./examples/example.fa -o ./examples_output/ --db ./examples/targe
9090
```
9191

9292
## Usage
93-
Please make `GMSC-mapper/gmsc_mapper` as your work directory.
93+
Please use `GMSC-mapper` as your working directory.
94+
95+
### Download GMSC database
96+
`--dbdir`: Path to the database output directory (default: `GMSC-mapper/db`).
97+
98+
`--all`: Download all database files without asking for confirmation.
99+
100+
`-f`: Force download even if the files exist
101+
102+
```
103+
gmsc-mapper downloaddb
104+
```
94105

95-
### Create GMSC database index
106+
### Create a GMSC database index for Diamond/MMseqs2
96107
`-o`: Path to the database output directory (default: `GMSC-mapper/db`).
97108

98109
`-m`: Alignment tool (Diamond / MMseqs2).
99110

100111
```
101-
cd gmsc_mapper
102-
gmsc-mapper createdb -i ../db/90AA_GMSC.faa.gz -m diamond
112+
gmsc-mapper createdb -i ./db/GMSC10.90AA.faa.gz -m diamond
103113
```
104114
or
105115
```
106-
cd gmsc_mapper
107-
gmsc-mapper createdb -i ../db/90AA_GMSC.faa.gz -m mmseqs
116+
gmsc-mapper createdb -i ./db/GMSC10.90AA.faa.gz -m mmseqs
108117
```
109118

110119
### Default
@@ -113,33 +122,33 @@ GMSC database / habitat / taxonomy / quality / domain file path and output direc
113122
1. Input is genome contig sequences.
114123

115124
```bash
116-
gmsc-mapper -i ../examples/example.fa
125+
gmsc-mapper -i ./examples/example.fa
117126
```
118127

119128
2. Input is amino acid sequences.
120129

121130
```bash
122-
gmsc-mapper --aa-genes ../examples/example.faa
131+
gmsc-mapper --aa-genes ./examples/example.faa
123132
```
124133

125134
3. Input is nucleotide gene sequences.
126135

127136
```bash
128-
gmsc-mapper --nt-genes ../examples/example.fna
137+
gmsc-mapper --nt-genes ./examples/example.fna
129138
```
130139

131140
### Alignment tool: Diamond / MMseqs2 is optional
132141
If you want to change alignment tool (Diamond / MMseqs2), you can use `--tool`.
133142

134143
```bash
135-
gmsc-mapper -i ../examples/example.fa --tool mmseqs
144+
gmsc-mapper -i ./examples/example.fa --tool mmseqs
136145
```
137146

138147
### Habitat / taxonomy / quality / domain annotation is optional
139148
If you don't want to annotate habitat / taxonomy / quality / domain you can use `--no-habitat`/`--no-taxonomy`/`--no-quality`/`--no-domain`.
140149

141150
```bash
142-
gmsc-mapper -i ../examples/example.fa --no-habitat --no-taxonomy --no-quality --no-domain
151+
gmsc-mapper -i ./examples/example.fa --no-habitat --no-taxonomy --no-quality --no-domain
143152
```
144153

145154
## Output files

gmsc_mapper/main.py

Lines changed: 88 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from atomicwrites import atomic_write
99
import logging
1010

11-
_ROOT = path.abspath(path.join(os.getcwd(), ".."))
11+
# Base directory for the default database paths: the directory the command is
# launched from (the current working directory, normalised to an absolute path).
_ROOT = path.abspath(os.getcwd())
1212

1313
logger = logging.getLogger('GMSC-mapper')
1414

@@ -18,7 +18,20 @@ def parse_args(args):
1818
subparsers = parser.add_subparsers(title='GMSC-mapper subcommands',
1919
dest='cmd',
2020
metavar='')
21-
21+
22+
cmd_download_db = subparsers.add_parser('downloaddb',
23+
help='Download target database')
24+
25+
cmd_download_db.add_argument('--dbdir',
26+
required=False,
27+
help='Path to the database files.',
28+
dest='dbdir',
29+
default = path.join(_ROOT, 'db'))
30+
cmd_download_db.add_argument('--all', action="store_true", dest='all',
31+
help='Download all database')
32+
cmd_download_db.add_argument('-f', action="store_true", dest='force',
33+
help='Force download even if the files exist')
34+
2235
cmd_create_db = subparsers.add_parser('createdb',
2336
help='Create target database')
2437
cmd_create_db.add_argument('-i',
@@ -116,43 +129,44 @@ def parse_args(args):
116129
parser.add_argument('--db', '--db',
117130
required=False,
118131
help='Path to the GMSC database file.',
119-
dest='database')
132+
dest='database',
133+
default=path.join(_ROOT, 'db/targetdb.dmnd'))
120134

121135
parser.add_argument('--habitat', '--habitat',
122136
required=False,
123137
help='Path to the habitat file',
124138
dest='habitat',
125-
default=path.join(_ROOT, 'db/ref_habitat.npy'))
139+
default=path.join(_ROOT, 'db/GMSC10.90AA.habitat.npy'))
126140

127141
parser.add_argument('--habitat-index', '--habitat-index',
128142
required=False,
129143
help='Path to the habitat index file',
130144
dest='habitatindex',
131-
default=path.join(_ROOT, 'db/ref_habitat_index.tsv'))
145+
default=path.join(_ROOT, 'db/GMSC10.90AA.habitat.index.tsv'))
132146

133147
parser.add_argument('--taxonomy', '--taxonomy',
134148
required=False,
135149
help='Path to the taxonomy file',
136150
dest='taxonomy',
137-
default=path.join(_ROOT, 'db/ref_taxonomy.npy'))
151+
default=path.join(_ROOT, 'db/GMSC10.90AA.taxonomy.npy'))
138152

139153
parser.add_argument('--taxonomy-index', '--taxonomy-index',
140154
required=False,
141155
help='Path to the taxonomy index file',
142156
dest='taxonomyindex',
143-
default=path.join(_ROOT, 'db/ref_taxonomy_index.tsv'))
157+
default=path.join(_ROOT, 'db/GMSC10.90AA.taxonomy.index.tsv'))
144158

145159
parser.add_argument('--quality', '--quality',
146160
required=False,
147161
help='Path to the quality file',
148162
dest='quality',
149-
default=path.join(_ROOT, 'db/ref_quality.tsv.xz'))
163+
default=path.join(_ROOT, 'db/GMSC10.90AA.high_quality.tsv.xz'))
150164

151165
parser.add_argument('--domain', '--domain',
152166
required=False,
153167
help='Path to the conserved domain file',
154168
dest='domain',
155-
default=path.join(_ROOT, 'db/ref_domain.tsv.xz'))
169+
default=path.join(_ROOT, 'db/GMSC10.90AA.cdd.tsv.xz'))
156170

157171
return parser.parse_args(args[1:])
158172

@@ -224,6 +238,68 @@ def expect_file(f):
224238
if not args.nodomain and args.domain:
225239
expect_file(args.domain)
226240

241+
_GMSC_FILES_URL = 'https://gmsc-api.big-data-biology.org/files/'

# One entry per downloadable database component:
# (label used in messages, size shown in the prompt, file names to fetch).
_GMSC_DB_COMPONENTS = (
    ('90AA fasta file', '~11G',
     ('GMSC10.90AA.faa.gz',)),
    ('habitat index file', '~2.3G',
     ('GMSC10.90AA.habitat.npy', 'GMSC10.90AA.habitat.index.tsv')),
    ('taxonomy index file', '~2.3G',
     ('GMSC10.90AA.taxonomy.npy', 'GMSC10.90AA.taxonomy.index.tsv')),
    ('quality index file', '2.6M',
     ('GMSC10.90AA.high_quality.tsv.xz',)),
    ('conserved domain index file', '88M',
     ('GMSC10.90AA.cdd.tsv.xz',)),
)


def _download_db_file(fname, dbdir):
    """Fetch one database file with wget into ``dbdir``, replacing any old copy."""
    os.makedirs(dbdir, exist_ok=True)
    target = os.path.join(dbdir, fname)
    partial = target + '.part'
    try:
        # Write to an explicit temporary name: with `-P dir`, wget keeps an
        # existing file and saves the new copy as `<name>.1`, so a forced
        # re-download would never replace the old file.
        subprocess.check_call(['wget', _GMSC_FILES_URL + fname, '-O', partial])
        # Only a completed download is moved onto the final name.
        os.replace(partial, target)
    finally:
        # A failed or interrupted download must not leave a truncated file
        # behind (after a successful replace there is nothing left to remove).
        if os.path.exists(partial):
            os.remove(partial)


def download_db(args):
    """Download the GMSC database files into ``args.dbdir``.

    For every database component the files are fetched when at least one of
    them is missing from ``args.dbdir`` or when ``args.force`` is set. Unless
    ``args.all`` is set, the user is asked to confirm each component first.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed ``downloaddb`` arguments; the attributes read are ``dbdir``
        (target directory), ``all`` (download without asking) and ``force``
        (download even if the files already exist).

    Raises
    ------
    subprocess.CalledProcessError
        If wget exits with a non-zero status.
    """
    from gmsc_mapper.utils import ask

    for label, size, fnames in _GMSC_DB_COMPONENTS:
        # Messages that start a sentence use the label with a capital letter.
        capitalized = label[0].upper() + label[1:]

        present = all(os.path.exists(os.path.join(args.dbdir, fname))
                      for fname in fnames)
        if present and not args.force:
            print('%s already exists. Skip downloading %s. Use -f to force download'
                  % (capitalized, label))
            continue

        if not (args.all or ask("Download %s (%s)?" % (label, size)) == 'y'):
            print('Skip downloading %s.' % label)
            continue

        logger.info('Start downloading %s...' % label)
        for fname in fnames:
            _download_db_file(fname, args.dbdir)
        logger.info('%s has been downloaded successfully.' % capitalized)
302+
227303
def create_db(args):
228304
if not os.path.exists(args.output):
229305
os.makedirs(args.output)
@@ -427,6 +503,9 @@ def main(args=None):
427503

428504
if args.cmd == 'createdb':
429505
create_db(args)
506+
507+
if args.cmd == 'downloaddb':
508+
download_db(args)
430509

431510
if not args.cmd:
432511
validate_args(args,has_diamond,has_mmseqs)

gmsc_mapper/utils.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,19 @@ def open_output(ofile, mode='wt'):
1414
return
1515

1616
with atomic_write(ofile, mode=mode, overwrite=True) as out:
17-
yield out
17+
yield out
18+
19+
def ask(string, valid_values=None, default=-1, case_sensitive=False):
    """Ask for a keyboard answer until one of the valid values is entered.

    Parameters
    ----------
    string : str
        Question shown to the user; the valid answers are appended to it.
    valid_values : list of str, optional
        Accepted answers. ``['y', 'n']`` is used when empty or omitted.
    default : int
        Index into ``valid_values`` of the answer used when the user enters
        nothing. A negative value (the default) means there is no default
        and the question is repeated.
    case_sensitive : bool
        When false (the default), both the valid values and the answer are
        lower-cased before they are compared.

    Returns
    -------
    str
        The accepted answer (lower-cased unless ``case_sensitive`` is true).
    """
    if not valid_values:
        valid_values = ['y', 'n']
    if not case_sensitive:
        valid_values = [value.lower() for value in valid_values]

    # The prompt does not change between attempts, so build it once.
    prompt = "%s [%s] " % (string, ','.join(valid_values))
    while True:
        # Strip surrounding whitespace so that answers such as "y " or a
        # line ending in "\r" are not rejected.
        answer = input(prompt).strip()
        if answer == '' and default >= 0:
            answer = valid_values[default]
        if not case_sensitive:
            answer = answer.lower()
        if answer in valid_values:
            return answer

0 commit comments

Comments
 (0)