@@ -8,7 +8,7 @@
from atomicwrites import atomic_write
import logging

-_ROOT = path.abspath(path.join(os.getcwd(), ".."))
+_ROOT = path.abspath(path.join(os.getcwd(), "."))

logger = logging.getLogger('GMSC-mapper')

@@ -18,7 +18,20 @@ def parse_args(args):
    subparsers = parser.add_subparsers(title='GMSC-mapper subcommands',
                                       dest='cmd',
                                       metavar='')
-
+
+    cmd_download_db = subparsers.add_parser('downloaddb',
+                                            help='Download target database')
+
+    cmd_download_db.add_argument('--dbdir',
+                                 required=False,
+                                 help='Path to the database files.',
+                                 dest='dbdir',
+                                 default=path.join(_ROOT, 'db'))
+    cmd_download_db.add_argument('--all', action="store_true", dest='all',
+                                 help='Download all database files')
+    cmd_download_db.add_argument('-f', action="store_true", dest='force',
+                                 help='Force download even if the files exist')
+
    cmd_create_db = subparsers.add_parser('createdb',
                                          help='Create target database')
    cmd_create_db.add_argument('-i',
@@ -116,43 +129,44 @@ def parse_args(args):
    parser.add_argument('--db',
                        required=False,
                        help='Path to the GMSC database file.',
-                        dest='database')
+                        dest='database',
+                        default=path.join(_ROOT, 'db/targetdb.dmnd'))

    parser.add_argument('--habitat',
                        required=False,
                        help='Path to the habitat file',
                        dest='habitat',
-                        default=path.join(_ROOT, 'db/ref_habitat.npy'))
+                        default=path.join(_ROOT, 'db/GMSC10.90AA.habitat.npy'))

    parser.add_argument('--habitat-index',
                        required=False,
                        help='Path to the habitat index file',
                        dest='habitatindex',
-                        default=path.join(_ROOT, 'db/ref_habitat_index.tsv'))
+                        default=path.join(_ROOT, 'db/GMSC10.90AA.habitat.index.tsv'))

    parser.add_argument('--taxonomy',
                        required=False,
                        help='Path to the taxonomy file',
                        dest='taxonomy',
-                        default=path.join(_ROOT, 'db/ref_taxonomy.npy'))
+                        default=path.join(_ROOT, 'db/GMSC10.90AA.taxonomy.npy'))

    parser.add_argument('--taxonomy-index',
                        required=False,
                        help='Path to the taxonomy index file',
                        dest='taxonomyindex',
-                        default=path.join(_ROOT, 'db/ref_taxonomy_index.tsv'))
+                        default=path.join(_ROOT, 'db/GMSC10.90AA.taxonomy.index.tsv'))

    parser.add_argument('--quality',
                        required=False,
                        help='Path to the quality file',
                        dest='quality',
-                        default=path.join(_ROOT, 'db/ref_quality.tsv.xz'))
+                        default=path.join(_ROOT, 'db/GMSC10.90AA.high_quality.tsv.xz'))

    parser.add_argument('--domain',
                        required=False,
                        help='Path to the conserved domain file',
                        dest='domain',
-                        default=path.join(_ROOT, 'db/ref_domain.tsv.xz'))
+                        default=path.join(_ROOT, 'db/GMSC10.90AA.cdd.tsv.xz'))

    return parser.parse_args(args[1:])

@@ -224,6 +238,68 @@ def expect_file(f):
    if not args.nodomain and args.domain:
        expect_file(args.domain)

+def download_db(args):
+    from gmsc_mapper.utils import ask
+
+    if args.force or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.faa.gz')):
+        if args.all or ask("Download 90AA fasta file (~11G)?") == 'y':
+            logger.info('Start downloading 90AA fasta file...')
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.faa.gz',
+                                   '-P', args.dbdir])
+            logger.info('90AA fasta file has been downloaded successfully.')
+        else:
+            print('Skip downloading 90AA fasta file.')
+    else:
+        print('90AA fasta file already exists. Skip downloading 90AA fasta file. Use -f to force download')
+
+    if args.force or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.habitat.npy')) or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.habitat.index.tsv')):
+        if args.all or ask("Download habitat index file (~2.3G)?") == 'y':
+            logger.info('Start downloading habitat index file...')
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.habitat.npy',
+                                   '-P', args.dbdir])
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.habitat.index.tsv',
+                                   '-P', args.dbdir])
+            logger.info('Habitat index file has been downloaded successfully.')
+        else:
+            print('Skip downloading habitat index file.')
+    else:
+        print('Habitat index file already exists. Skip downloading habitat index file. Use -f to force download')
+
+    if args.force or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.taxonomy.npy')) or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.taxonomy.index.tsv')):
+        if args.all or ask("Download taxonomy index file (~2.3G)?") == 'y':
+            logger.info('Start downloading taxonomy index file...')
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.taxonomy.npy',
+                                   '-P', args.dbdir])
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.taxonomy.index.tsv',
+                                   '-P', args.dbdir])
+            logger.info('Taxonomy index file has been downloaded successfully.')
+        else:
+            print('Skip downloading taxonomy index file.')
+    else:
+        print('Taxonomy index file already exists. Skip downloading taxonomy index file. Use -f to force download')
+
+    if args.force or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.high_quality.tsv.xz')):
+        if args.all or ask("Download quality index file (2.6M)?") == 'y':
+            logger.info('Start downloading quality index file...')
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.high_quality.tsv.xz',
+                                   '-P', args.dbdir])
+            logger.info('Quality index file has been downloaded successfully.')
+        else:
+            print('Skip downloading quality index file.')
+    else:
+        print('Quality index file already exists. Skip downloading quality index file. Use -f to force download')
+
+    if args.force or not os.path.exists(os.path.join(args.dbdir, 'GMSC10.90AA.cdd.tsv.xz')):
+        if args.all or ask("Download conserved domain index file (88M)?") == 'y':
+            logger.info('Start downloading conserved domain index file...')
+            subprocess.check_call(['wget', 'https://gmsc-api.big-data-biology.org/files/GMSC10.90AA.cdd.tsv.xz',
+                                   '-P', args.dbdir])
+            logger.info('Conserved domain index file has been downloaded successfully.')
+        else:
+            print('Skip downloading conserved domain index file.')
+    else:
+        print('Conserved domain index file already exists. Skip downloading conserved domain index file. Use -f to force download')
+
def create_db(args):
    if not os.path.exists(args.output):
        os.makedirs(args.output)
@@ -427,6 +503,9 @@ def main(args=None):

    if args.cmd == 'createdb':
        create_db(args)
+
+    if args.cmd == 'downloaddb':
+        download_db(args)

    if not args.cmd:
        validate_args(args, has_diamond, has_mmseqs)