@@ -5,43 +5,80 @@ import shutil
55import tempfile
66import subprocess
77import argparse
8+ import logging
9+
__version__ = '0.6.7'

# Root-logger configuration shared by the whole script: every message is
# prefixed with a timestamp and everything from DEBUG upwards is emitted.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG)


class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    """Help formatter combining default-value display with a raw epilog.

    ``ArgumentDefaultsHelpFormatter`` appends each option's default to its
    help text; ``RawDescriptionHelpFormatter`` keeps the line breaks of the
    description/epilog instead of re-wrapping them.
    """


desc = 'parallel fastq-dump wrapper, extra args will be passed through'
epi = """DESCRIPTION:
Example: parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip
"""

# The parser lives at module level so that main() can call
# parser.parse_known_args(); unknown options are left over for fastq-dump.
parser = argparse.ArgumentParser(description=desc, epilog=epi,
                                 formatter_class=CustomFormatter)
# '--sra-id' may be repeated; each occurrence is appended to a list.
parser.add_argument('-s', '--sra-id', help='SRA id', action='append')
parser.add_argument('-t', '--threads', help='number of threads',
                    default=1, type=int)
parser.add_argument('-O', '--outdir', help='output directory', default='.')
parser.add_argument('-T', '--tmpdir', help='temporary directory',
                    default=None)
parser.add_argument('-N', '--minSpotId', help='Minimum spot id',
                    default=1, type=int)
parser.add_argument('-X', '--maxSpotId', help='Maximum spot id',
                    default=None, type=int)
parser.add_argument('-V', '--version', help='shows version',
                    action='store_true', default=False)
1033
1134def pfd (args , srr_id , extra_args ):
12- tmp_dir = tempfile .TemporaryDirectory (prefix = "pfd_" ,dir = args .tmpdir )
13- sys .stderr .write ("tempdir: {}\n " .format (tmp_dir .name ))
35+ """
36+ Parallel fastq dump
37+ Parameters
38+ ----------
39+ args : dict
40+ User-provided args
41+ srr_id : str
42+ SRR ID
43+ extra_args : dict
44+ Extra args
45+ """
46+ tmp_dir = tempfile .TemporaryDirectory (prefix = 'pfd_' ,dir = args .tmpdir )
47+ logging .info ('tempdir: {}' .format (tmp_dir .name ))
1448
1549 n_spots = get_spot_count (srr_id )
16- sys . stderr . write ( " {} spots: {}\n " .format (srr_id ,n_spots ))
50+ logging . info ( ' {} spots: {}' .format (srr_id ,n_spots ))
1751
1852 # minSpotId cant be lower than 1
1953 start = max (args .minSpotId , 1 )
2054 # maxSpotId cant be higher than n_spots
2155 end = min (args .maxSpotId , n_spots ) if args .maxSpotId is not None else n_spots
2256
2357 blocks = split_blocks (start , end , args .threads )
24- sys . stderr . write ( " blocks: {}\n " .format (blocks ))
25-
58+ logging . info ( ' blocks: {}' .format (blocks ))
59+
2660 ps = []
2761 for i in range (0 ,args .threads ):
2862 d = os .path .join (tmp_dir .name , str (i ))
2963 os .mkdir (d )
30- p = subprocess .Popen (["fastq-dump" , "-N" , str (blocks [i ][0 ]), "-X" , str (blocks [i ][1 ]), "-O" , d ]+ extra_args + [srr_id ])
64+ cmd = ['fastq-dump' , '-N' , str (blocks [i ][0 ]), '-X' , str (blocks [i ][1 ]),
65+ '-O' , d ] + extra_args + [srr_id ]
66+ logging .info ('CMD: {}' .format (' ' .join (cmd )))
67+ p = subprocess .Popen (cmd )
3168 ps .append (p )
3269
3370 wfd = {}
3471 for i in range (0 ,args .threads ):
3572 exit_code = ps [i ].wait ()
3673 if exit_code != 0 :
37- sys . stderr . write ( " fastq-dump error! exit code: {}\n " .format (exit_code ))
74+ logging . warning ( ' fastq-dump error! exit code: {}' .format (exit_code ))
3875 sys .exit (1 )
3976
4077 tmp_path = os .path .join (tmp_dir .name , str (i ))
4178 for fo in os .listdir (tmp_path ):
4279 if fo not in wfd :
43- wfd [fo ] = open (os .path .join (args .outdir ,fo ), "wb" )
44- with open (os .path .join (tmp_path ,fo ), "rb" ) as fd :
80+ wfd [fo ] = open (os .path .join (args .outdir ,fo ), 'wb' )
81+ with open (os .path .join (tmp_path ,fo ), 'rb' ) as fd :
4582 shutil .copyfileobj (fd , wfd [fo ])
4683 os .remove (os .path .join (tmp_path ,fo ))
4784
@@ -61,12 +98,28 @@ def split_blocks(start, end, n_pieces):
6198 return out
6299
def get_spot_count(sra_id):
    """
    Get spot count via sra-stat

    Runs ``sra-stat --meta --quick <sra_id>`` and, for every line of its
    standard output, takes the third ``|``-separated field, keeps the part
    before the first ``:`` and adds it (as an integer) to the total.

    Parameters
    ----------
    sra_id : str
        SRA ID

    Returns
    -------
    int
        Sum of the per-line counts reported by sra-stat.

    Raises
    ------
    IndexError
        If an output line does not have the expected ``a|b|n:...`` layout
        (this includes sra-stat printing nothing at all).
    ValueError
        If the count field is not an integer.
        Both exceptions carry the captured sra-stat STDOUT and STDERR.
    """
    cmd = ['sra-stat', '--meta', '--quick', sra_id]
    logging.info('CMD: {}'.format(' '.join(cmd)))
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        # Not fatal on its own: the output may still be parseable, and if it
        # is not, the parsing error below reports STDOUT/STDERR in full.
        logging.warning('sra-stat exit code: %s', p.returncode)

    txt = stdout.decode().rstrip().split('\n')
    try:
        return sum(int(line.split('|')[2].split(':')[0]) for line in txt)
    except (IndexError, ValueError) as err:
        msg = 'sra-stat output parsing error!'
        msg += '\n--sra-stat STDOUT--\n{}'
        msg += '\n--sra-stat STDERR--\n{}'
        etxt = stderr.decode().rstrip().split('\n')
        # Re-raise with the same exception type so existing handlers keep
        # working, but attach the tool's output and the original cause.
        raise type(err)(msg.format('\n'.join(txt), '\n'.join(etxt))) from err
71124
72125def partition (f , l ):
@@ -79,46 +132,51 @@ def partition(f, l):
79132 return r
80133
def is_sra_file(path):
    """
    Determine whether path is SRA file

    Only the basename of ``path`` is inspected. It is treated as an SRA
    file/accession when it ends with ``.sra`` or contains one of the
    substrings ``SRR``, ``ERR`` or ``DRR``; both checks ignore case.

    Parameters
    ----------
    path : str
        file path

    Returns
    -------
    bool
        True if the basename looks like an SRA file or accession.
    """
    f = os.path.basename(path)
    if f.lower().endswith('.sra'):
        return True
    # NOTE(review): this is a plain substring match, so any token containing
    # "ERR"/"SRR"/"DRR" anywhere in its basename is accepted.
    upper = f.upper()
    return any(tag in upper for tag in ('SRR', 'ERR', 'DRR'))
88148
def main():
    """
    Main interface

    Parses the command line with the module-level ``parser``. Options the
    parser does not know are split with ``partition(is_sra_file, ...)``
    into additional SRA ids and arguments passed through to ``pfd``.

    * ``--version``: print this wrapper's version, run ``fastq-dump -V``
      and exit with status 0.
    * at least one ``--sra-id``: make sure the output (and, if given,
      temporary) directory exists, then call ``pfd`` once per id.
    * otherwise: print the help text and exit with status 1.
    """
    args, extra = parser.parse_known_args()

    if args.version:
        print('parallel-fastq-dump : {}'.format(__version__))
        subprocess.Popen(['fastq-dump', '-V']).wait()
        sys.exit(0)

    if not args.sra_id:
        parser.print_help()
        sys.exit(1)

    extra_srrs, extra_args = partition(is_sra_file, extra)
    args.sra_id.extend(extra_srrs)
    logging.info('SRR ids: %s', args.sra_id)
    logging.info('extra args: %s', extra_args)

    # exist_ok=True replaces the isdir()-then-makedirs() sequence: it is a
    # no-op for existing directories (including '.') and cannot fail when
    # another process creates the directory between the check and the call.
    os.makedirs(args.outdir, exist_ok=True)
    if args.tmpdir is not None:
        os.makedirs(args.tmpdir, exist_ok=True)

    # fastq dump
    for si in args.sra_id:
        pfd(args, si, extra_args)
121179
122- if __name__ == " __main__" :
# Run the CLI only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
124182
0 commit comments