44"""
55Description:
66 Large-scale metabolic reconstruction of bacterial genomes.
7-
8- ::
97
108 usage:
11- padmet metabolic_reconstruction [-h] -i INPUT -o OUTPUT --tax TAXFILE --padmet_ref PATH_TO_PADMET_REF --ptsc PTSC --ptsi PTSI [--annot ANNOT] [--egg_path EGG_PATH] [--bak_path BAK_PATH] [-c CPUS] [-k TO_KEEP]
12- -h, --help Show this help message and exit
13- -i INPUT, --input INPUT Path to the folder where the genomes are
14- -o OUTPUT, --output OUTPUT Path to the folder where you want to put the results in
15- --tax TAXFILE Path to the taxon file (.tsv)
16- --padmet_ref PATH_TO_PADMET_REF Path to the reference database in Padmet format.
17- --ptsc PTSC Path to the root folder (for construction of Singularity bridge, necessary to access distant files).
18- --ptsi PTSI Path to the singularity image of mpwt to use.
19- --annot ANNOT Annotation tool(s) to use between 'bakta' (default), 'eggnog' and 'prokka'. If several annotation tools to use, write them comma-separated.
20- --egg_path EGG_PATH Path to the Eggnog database, mandatory if you want to use eggnog as annotation tool.
21- --bak_path BAK_PATH Path to the Bakta database, mandatory if you want to use bakta as annotation tool.
22- -c CPUS, --cpus CPUS Give the number of available CPUs
23- -k TO_KEEP, --keep TO_KEEP Give the file formats to keep - comma-separated list, '.' included
9+ padmet metabolic_reconstruction [-h] -i FOLDER -o FOLDER --tax FILE --padmet_ref FILE --ptsc FOLDER --ptsi FILE [--annot STR] [--egg_path FOLDER] [--bak_path FOLDER] [-c INT] [-k STR]
10+
11+ -h --help Show this help message and exit
12+ -i --input=FOLDER Path to the folder where the genomes are
13+ -o --output=FOLDER Path to the folder where you want to put the results in
14+ --tax=FILE Path to the taxon file (.tsv)
15+ --padmet_ref=FILE Path to the reference database in Padmet format.
16+ --ptsc=FOLDER Path to the root folder (for construction of Singularity bridge, necessary to access distant files).
17+ --ptsi=FILE Path to the singularity image of mpwt to use.
18+ --annot=STR Annotation tool(s) to use between 'bakta' (default), 'eggnog' and 'prokka'. If several annotation tools to use, write them comma-separated.
19+ --egg_path=FOLDER Path to the Eggnog database, mandatory if you want to use eggnog as annotation tool.
20+ --bak_path=FOLDER Path to the Bakta database, mandatory if you want to use bakta as annotation tool.
21+ -c --cpus=INT Give the number of available CPUs
22+ -k --keep=STR Give the file formats to keep - comma-separated list, '.' included
2423"""
2524
2625import os
@@ -42,26 +41,10 @@ def command_help():
4241 """
4342 print (docopt .docopt (__doc__ ))
4443
45-
46- def parser () :
47- parser = argparse .ArgumentParser (description = "Large-scale metabolic reconstruction of bacterial genomes." )
48-
49- ## arguments
50- parser .add_argument ("-i" , "--input" , required = True , dest = "input" ,help = "Path to the folder where the genomes are" )
51- parser .add_argument ("-o" , "--output" , required = True , dest = "output" ,help = "Path to the folder where you want to put the results in" )
52- parser .add_argument ("--tax" , required = True , dest = "taxfile" ,help = "Path to the taxon file (.tsv)" )
53- parser .add_argument ("--padmet_ref" , required = True , dest = "path_to_padmet_ref" , help = "Path to the reference database in Padmet format." )
54- parser .add_argument ("--ptsc" , required = True , dest = "ptsc" , help = "Path to the root folder (for construction of Singularity bridge, necessary to access distant files)." )
55- parser .add_argument ("--ptsi" , required = True , dest = "ptsi" , help = "Path to the singularity image of mpwt to use." )
44+ def padmet_stats_cli (command_args ):
45+ args = docopt .docopt (__doc__ , argv = command_args )
5646
57- ## options
58- parser .add_argument ("--annot" , dest = "annot" , default = "prokka" , help = "Annotation tool(s) to use between 'prokka' (default), 'eggnog' and 'bakta'. If several annotation tools to use, write them comma-separated." )
59- parser .add_argument ("--egg_path" ,dest = "egg_path" ,help = "Path to the Eggnog database, mandatory if you want to use eggnog as annotation tool." )
60- parser .add_argument ("--bak_path" ,dest = "bak_path" ,help = "Path to the Bakta database, mandatory if you want to use bakta as annotation tool." )
61- parser .add_argument ("-c" ,"--cpus" , dest = "cpus" , default = 2 , help = "Give the number of available CPUs" )
62- parser .add_argument ("-k" ,"--keep" , dest = "to_keep" , default = "" , help = """Give the file formats to keep - comma-separated list, '.' included""" )
63-
64- return parser .parse_args ()
47+ metabolic_reconstruction (args )
6548
6649
6750def my_basename (file ):
@@ -170,7 +153,7 @@ def mkdir(path) :
170153## KEY-FUNCTIONS -----------------------------------------------------------------------------
171154
172155
173- def bakta_annotation (input_dir , output_path , options ):
156+ def bakta_annotation (input_dir , output_path , args ):
174157 """
175158 Bakta annotation step : from a fasta file, generate a GBK file of annotated genome. Iterated on all genomes
176159 Inputs :
@@ -181,7 +164,7 @@ def bakta_annotation(input_dir, output_path, options):
181164 processed (list) : list of processed genomes' names
182165 """
183166 print ("Bakta annotation launched.\n " )
184- path_to_bak = options . bak_path
167+ path_to_bak = args [ "-- bak_path" ]
185168 mkdir (os .path .join (output_path , 'bakta' ))
186169 processed = pd .DataFrame (columns = ['genome' , "bakta" ])
187170
@@ -193,14 +176,14 @@ def bakta_annotation(input_dir, output_path, options):
193176 ## annotate genomes
194177 mkdir (output )
195178 fasta = (os .path .join (input_dir , genome_name , genome_name + ".fasta" ))
196- command = f"bakta --db { path_to_bak } { fasta } --output { output } --prefix { genome_name } --compliant --force --threads { options . cpus } "
179+ command = f"bakta --db { path_to_bak } { fasta } --output { output } --prefix { genome_name } --compliant --force --threads { args [ '-- cpus' ] } "
197180 bigprint (command )
198181 os .system (command )
199182 ## --compliant Force Genbank/ENA/DDJB compliance
200183 ## --force Force overwriting existing output folder
201184
202185 ## removing unused files
203- unused_files = set ([".embl" , ".faa" , ".ffn" , ".fna" , ".gff3" , ".hypotheticals.faa" , ".hypotheticals.tsv" , ".json" , ".log" , ".png" , ".svg" , ".tsv" ]) - set (options . to_keep .split ("," ))
186+ unused_files = set ([".embl" , ".faa" , ".ffn" , ".fna" , ".gff3" , ".hypotheticals.faa" , ".hypotheticals.tsv" , ".json" , ".log" , ".png" , ".svg" , ".tsv" ]) - set (args [ "-k" ] .split ("," ))
204187 for extension in unused_files :
205188 file_to_delete = os .path .join (output , genome_name + extension )
206189 if os .path .exists (file_to_delete ) :
@@ -215,7 +198,7 @@ def bakta_annotation(input_dir, output_path, options):
215198 return processed
216199
217200
218- def prokka_annotation (input_dir , output_path , options ) :
201+ def prokka_annotation (input_dir , output_path , args ) :
219202 """
220203 Prokka annotation step : from a fasta file, generate a GBK file of annotated genome. Iterated on all genomes
221204 Inputs :
@@ -236,13 +219,13 @@ def prokka_annotation(input_dir, output_path, options) :
236219 ## launch annotation
237220 fasta = os .path .join (input_dir , genome_name , f'{ genome_name } .fasta' )
238221 outdir = os .path .join (output_path , "prokka" , genome_name )
239- command_pro = f"prokka { fasta } --outdir { outdir } --prefix { genome_name } --compliant --force --cpus { options . cpus } "
222+ command_pro = f"prokka { fasta } --outdir { outdir } --prefix { genome_name } --compliant --force --cpus { args [ '-- cpus' ] } "
240223 ## --compliant Force Genbank/ENA/DDJB compliance
241224 bigprint (command_pro )
242225 os .system (command_pro )
243226
244227 ## removing unused files
245- unused_files = set ([".ecn" , ".err" , ".ffn" , ".fixed*" , ".fsa" , ".gff" , ".log" , ".sqn" , ".tbl" , ".val" , ".faa " ]) - set (options . to_keep .split ("," ))
228+ unused_files = set ([".ecn" , ".err" , ".ffn" , ".fixed*" , ".fsa" , ".gff" , ".log" , ".sqn" , ".tbl" , ".val" , ".faa " ]) - set (args [ "-k" ] .split ("," ))
246229 for extension in unused_files :
247230 file_to_delete = f"{ prok_file } { extension } "
248231 if os .path .exists (file_to_delete ) :
@@ -257,7 +240,7 @@ def prokka_annotation(input_dir, output_path, options) :
257240 return processed
258241
259242
260- def eggnog_annotation (input_dir , output_path , options ):
243+ def eggnog_annotation (input_dir , output_path , args ):
261244 """
262245 EggNOG-mapper annotation step : from a fasta file, generate a GBK file of annotated genome. Iterated on all genomes
263246 Inputs :
@@ -268,7 +251,7 @@ def eggnog_annotation(input_dir, output_path, options):
268251 processed (list) : list of processed genomes' names
269252 """
270253 print ("Eggnog annotation launched.\n " )
271- path_to_egg = options . egg_path
254+ path_to_egg = args [ "-- egg_path" ]
272255 mkdir (os .path .join (output_path , 'eggnog' ))
273256 processed = pd .DataFrame (columns = ['genome' , "eggnog" ])
274257
@@ -280,15 +263,15 @@ def eggnog_annotation(input_dir, output_path, options):
280263 ## annotation
281264 mkdir (output_eggnog )
282265 genome = os .path .join (input_dir , genome_name , genome_name + ".fasta" )
283- command_egg = f"emapper.py -i { genome } -o { genome_name } --cpu { options . cpus } --itype genome --data_dir { path_to_egg } --output_dir { output_eggnog } --dbmem --genepred prodigal --override"
266+ command_egg = f"emapper.py -i { genome } -o { genome_name } --cpu { args [ '-- cpus' ] } --itype genome --data_dir { path_to_egg } --output_dir { output_eggnog } --dbmem --genepred prodigal --override"
284267 bigprint (command_egg )
285268 os .system (command_egg )
286269
287270 ## conversion of eggnog output to gbk
288271 prot = os .path .join (output_eggnog , genome_name + ".emapper.genepred.fasta" )
289272 gff = os .path .join (output_eggnog , genome_name + ".emapper.genepred.gff" )
290273 annot = os .path .join (output_eggnog , genome_name + ".emapper.annotations" )
291- command_egg2gbk = f' emapper2gbk genomes -fn { genome } -fp { prot } -g { gff } -a { annot } -o { out_file } -gt eggnog -c { options . cpus } '
274+ command_egg2gbk = f" emapper2gbk genomes -fn { genome } -fp { prot } -g { gff } -a { annot } -o { out_file } -gt eggnog -c { args [ '-- cpus' ] } "
292275 bigprint (command_egg2gbk )
293276 os .system (command_egg2gbk )
294277
@@ -299,7 +282,7 @@ def eggnog_annotation(input_dir, output_path, options):
299282 return processed
300283
301284
302- def create_taxon_file (annotation , genomes , options ):
285+ def create_taxon_file (annotation , genomes , output_path , args ):
303286 """
304287 From taxon file, generate another version of taxon file
305288 interpretable for mpwt in each annotation tool directory
@@ -310,8 +293,8 @@ def create_taxon_file(annotation, genomes, options):
310293 Output :
311294 a taxfile per annotool's directory
312295 """
313- output_path = options . output
314- taxfile = options . taxfile
296+
297+ taxfile = args [ "-- taxfile" ]
315298 genomes = genomes .to_list ()
316299
317300 df_taxons = pd .read_csv (taxfile , sep = '\t ' )
@@ -330,7 +313,7 @@ def create_taxon_file(annotation, genomes, options):
330313 tax_file = os .path .join (output_path , annotool , 'taxon_id.tsv' )
331314 df_to_write .to_csv (tax_file , sep = "\t " , index = False )
332315
333- def run_mpwt (output_path , annotation , genomes_names , options ):
316+ def run_mpwt (output_path , annotation , genomes_names , args ):
334317 """
335318 Run mpwt on GBK files to generate PGDBs
336319 Inputs :
@@ -339,8 +322,8 @@ def run_mpwt(output_path, annotation, genomes_names, options):
339322 genomes_names (list) : list of genomes names to iterate on
340323 options (parser) : arguments from parser
341324 """
342- path_to_scratch = options . ptsc
343- path_to_singularity = options . ptsi
325+ path_to_scratch = args [ "-- ptsc" ]
326+ path_to_singularity = args [ "-- ptsi" ]
344327 mkdir (os .path .join (output_path , 'mpwt' ))
345328
346329 for annotool in annotation :
@@ -353,7 +336,7 @@ def run_mpwt(output_path, annotation, genomes_names, options):
353336 print (f"Mpwt on { annotool } : { len (dat_dirs )} mpwt repositories found out of { len (genomes_names )} genomes to process" )
354337
355338 if len (dat_dirs ) != len (genomes_names ):
356- command_mpwt = f"singularity exec -B { path_to_scratch } :{ path_to_scratch } { path_to_scratch } { path_to_singularity } mpwt -f { output_path } { annotool } / -o { annotool_outdir } --cpu { options . cpus } --patho --flat --clean --md -v"
339+ command_mpwt = f"singularity exec -B { path_to_scratch } :{ path_to_scratch } { path_to_scratch } { path_to_singularity } mpwt -f { output_path } { annotool } / -o { annotool_outdir } --cpu { args [ '-- cpus' ] } --patho --flat --clean --md -v"
357340 ## --patho : Launch PathoLogic inference on input folder
358341 ## --flat : Create BioPAX/attribute-value flat files
359342 ## --clean : Delete all PGDBs in ptools-local folder or only PGDB from input folder
@@ -423,7 +406,7 @@ def check_padmet(df_summary, path):
423406 return df_summary
424407
425408
426- def convert2padmet (output_path , annotation , genomes_names , options ):
409+ def convert2padmet (output_path , annotation , genomes_names , args ):
427410 """
428411 Convert PGDBs in several .dat files into one strain-specific padmet file
429412 Inputs :
@@ -432,9 +415,9 @@ def convert2padmet(output_path, annotation, genomes_names, options):
432415 genomes_names (list) : list of genomes names to iterate on
433416 options (parser) : arguments from parser
434417 """
435- path_to_padmet_ref = options . path_to_padmet_ref
436- path_to_scratch = options . ptsc
437- path_to_singularity = options . ptsi
418+ path_to_padmet_ref = args [ "-- path_to_padmet_ref" ]
419+ path_to_scratch = args [ "-- ptsc" ]
420+ path_to_singularity = args [ "-- ptsi" ]
438421 padmet_output = output_path + 'padmet'
439422 mkdir (padmet_output )
440423
@@ -454,7 +437,7 @@ def convert2padmet(output_path, annotation, genomes_names, options):
454437 bigprint (command_pgdb2padmet_source )
455438 os .system (command_pgdb2padmet_source )
456439
457- def merge_padmet (output_path , annotation , genomes_names , options , df_summary ) :
440+ def merge_padmet (output_path , annotation , genomes_names , args , df_summary ) :
458441 """
459442 Merge padmets of a same strain all together in one padmet file
460443 Inputs :
@@ -463,8 +446,8 @@ def merge_padmet(output_path, annotation, genomes_names, options, df_summary) :
463446 genomes_names (list) : list of genomes names to iterate on
464447 options (parser) : arguments from parser
465448 """
466- path_to_scratch = options . ptsc
467- path_to_singularity = options . ptsi
449+ path_to_scratch = args [ "-- ptsc" ]
450+ path_to_singularity = args [ "-- ptsi" ]
468451 padmet_output = os .path .join (output_path , 'padmet' )
469452 output_merged = os .path .join (output_path , 'merged_padmet' )
470453 mkdir (output_merged )
@@ -549,7 +532,7 @@ def rename(file) :
549532 return file
550533
551534
552- def check_taxfile (options ) :
535+ def check_taxfile (args ) :
553536 """
554537 Check function relying on taxon file. Rename every file name read in it by
555538 removing error-generating characters, check if the corresponding file exists
@@ -559,8 +542,9 @@ def check_taxfile(options) :
559542 - taxfile (str) : path of the taxa file
560543 - genomes (str) : path to genomes directory
561544 """
562- taxfile = options .taxfile
563- genomes = options .input
545+ genomes = args ["--input" ]
546+ taxfile = args ["--tax" ]
547+
564548 if not os .path .exists (taxfile ) :
565549 return False , f"ERROR : no file found at { taxfile } "
566550 try :
@@ -590,21 +574,20 @@ def check_taxfile(options) :
590574 return True , ""
591575
592576
593- def main ( ) :
577+ def metabolic_reconstruction ( args ) :
594578 ## parsing arguments
595- options = parser ()
596- input_dir = options .input
597- output_path = options .output
579+ input_dir = args ["--input" ]
580+ output_path = args ["--output" ]
598581
599582 ## Creating output directory
600583 mkdir (output_path )
601- annotation = options . annot . split ( "," )
584+ annotation = args [ "-- annot" ]
602585 df_summary = pd .DataFrame (columns = ["genome" ])
603586
604587 ## unzipping and renaming to fasta if needed
605588 check_gzipped_only (input_dir )
606589
607- tax_ok , message = check_taxfile (options )
590+ tax_ok , message = check_taxfile (args )
608591 if not tax_ok :
609592 print (message )
610593 exit (0 )
@@ -613,11 +596,11 @@ def main() :
613596 for annotool in annotation :
614597 start = time .time ()
615598 if annotool == 'prokka' :
616- genomes_processed = prokka_annotation (input_dir , output_path , options )
599+ genomes_processed = prokka_annotation (input_dir , output_path , args )
617600 elif annotool == 'eggnog' :
618- genomes_processed = eggnog_annotation (input_dir , output_path , options )
601+ genomes_processed = eggnog_annotation (input_dir , output_path , args )
619602 elif annotool == 'bakta' :
620- genomes_processed = bakta_annotation (input_dir , output_path , options )
603+ genomes_processed = bakta_annotation (input_dir , output_path , args )
621604 else :
622605 continue
623606 time_taken = time .time () - start
@@ -630,16 +613,16 @@ def main() :
630613 df_summary , genomes_names = check_files ("annot" , output_path , df_summary , annotation )
631614
632615 ## mpwt's metabolic network construction step
633- create_taxon_file (annotation , genomes_names , options )
616+ create_taxon_file (annotation , genomes_names , output_path , args )
634617 start = time .time ()
635- run_mpwt (output_path , annotation , genomes_names , options )
618+ run_mpwt (output_path , annotation , genomes_names , args )
636619 time_taken = time .time () - start
637620 print (f"INFO : Mpwt step took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
638621
639622 ## checking if mpwt ran correctly for all annotools, identify convertible genomes and convert them using padmet
640623 df_summary , genomes_names = check_files ("mpwt" , output_path , df_summary , annotation )
641624 start = time .time ()
642- convert2padmet (output_path , annotation , genomes_names , options )
625+ convert2padmet (output_path , annotation , genomes_names , args )
643626 time_taken = time .time () - start
644627 print (f"INFO : Conversion to padmet took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
645628
@@ -652,7 +635,7 @@ def main() :
652635
653636 ## merge padmets and save summary
654637 start = time .time ()
655- df_summary = merge_padmet (output_path , annotation , genomes_names , options , df_summary )
638+ df_summary = merge_padmet (output_path , annotation , genomes_names , args , df_summary )
656639 time_taken = time .time () - start
657640 print (f"INFO : Merging padmets step took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
658641
0 commit comments