66 Large-scale metabolic reconstruction of bacterial genomes.
77
88 usage:
9- padmet metabolic_reconstruction [-h] -i FOLDER -o FOLDER --tax FILE --padmet_ref FILE --ptsc FOLDER --ptsi FILE [--annot STR] [--egg_path FOLDER] [--bak_path FOLDER] [-c INT] [-k STR]
9+ padmet metabolic_reconstruction [-h] -i FOLDER -o FOLDER --taxfile FILE --padmet_ref FILE --ptsc FOLDER --ptsi FILE [--annot STR] [--egg_path FOLDER] [--bak_path FOLDER] [-c INT] [--keep STR]
1010
1111 -h --help Show this help message and exit
12- -i --input=FOLDER Path to the folder where the genomes are
13- -o --output=FOLDER Path to the folder where you want to put the results in
14- --tax =FILE Path to the taxon file (.tsv)
12+ -i --input=STR Path to the folder where the genomes are
13+ -o --output=STR Path to the folder where you want to put the results in
14+ -t --taxfile =FILE Path to the taxon file (.tsv)
1515 --padmet_ref=FILE Path to the reference database in Padmet format.
1616 --ptsc=FOLDER Path to the root folder (for construction of Singularity bridge, necessary to access distant files).
1717 --ptsi=FILE Path to the singularity image of mpwt to use.
@@ -41,9 +41,9 @@ def command_help():
4141 """
4242 print (docopt .docopt (__doc__ ))
4343
44- def padmet_stats_cli (command_args ):
44+ def metabolic_reconstruction_cli (command_args ):
4545 args = docopt .docopt (__doc__ , argv = command_args )
46-
46+
4747 metabolic_reconstruction (args )
4848
4949
@@ -167,6 +167,10 @@ def bakta_annotation(input_dir, output_path, args):
167167 path_to_bak = args ["--bak_path" ]
168168 mkdir (os .path .join (output_path , 'bakta' ))
169169 processed = pd .DataFrame (columns = ['genome' , "bakta" ])
170+ if (args ["--keep" ]) != None :
171+ to_keep = args ["--keep" ].split ("," )
172+ else :
173+ to_keep = []
170174
171175 for genome_name in os .listdir (input_dir ) :
172176 output = (os .path .join (output_path , "bakta" , genome_name ))
@@ -182,19 +186,21 @@ def bakta_annotation(input_dir, output_path, args):
182186 ## --compliant Force Genbank/ENA/DDJB compliance
183187 ## --force Force overwriting existing output folder
184188
185- ## removing unused files
186- unused_files = set ([".embl" , ".faa" , ".ffn" , ".fna" , ".gff3" , ".hypotheticals.faa" , ".hypotheticals.tsv" , ".json" , ".log" , ".png" , ".svg" , ".tsv" ]) - set (args ["-k" ].split ("," ))
187- for extension in unused_files :
188- file_to_delete = os .path .join (output , genome_name + extension )
189- if os .path .exists (file_to_delete ) :
190- os .remove (file_to_delete )
191-
192- ## rename and count processed genomes
193- if os .path .exists (os .path .join (output , genome_name + ".gbff" )):
194- move (os .path .join (output , genome_name + ".gbff" ), final_file )
195- if os .path .exists (final_file ):
196- processed .loc [len (processed )] = [genome_name , "OK" ]
189+ ## removing unused files
190+ unused_files = set ([".embl" , ".faa" , ".ffn" , ".fna" , ".gff3" , ".hypotheticals.faa" , ".hypotheticals.tsv" , ".json" , ".log" , ".png" , ".svg" , ".tsv" ]) - set (to_keep )
191+ for extension in unused_files :
192+ file_to_delete = os .path .join (output , genome_name + extension )
193+ if os .path .exists (file_to_delete ) :
194+ os .remove (file_to_delete )
195+
196+ ## rename processed genomes
197+ if os .path .exists (os .path .join (output , genome_name + ".gbff" )):
198+ move (os .path .join (output , genome_name + ".gbff" ), final_file )
197199
200+ ## count processed genomes
201+ if os .path .exists (final_file ):
202+ processed .loc [len (processed )] = [genome_name , "OK" ]
203+
198204 return processed
199205
200206
@@ -211,6 +217,10 @@ def prokka_annotation(input_dir, output_path, args) :
211217 print ("Prokka annotation launched.\n " )
212218 mkdir (os .path .join (output_path , 'prokka' ))
213219 processed = pd .DataFrame (columns = ['genome' , "prokka" ])
220+ if (args ["--keep" ]) != None :
221+ to_keep = args ["--keep" ].split ("," )
222+ else :
223+ to_keep = []
214224
215225 for genome_name in os .listdir (input_dir ) :
216226 prok_file = os .path .join (output_path , "prokka" , genome_name , genome_name )
@@ -224,18 +234,20 @@ def prokka_annotation(input_dir, output_path, args) :
224234 bigprint (command_pro )
225235 os .system (command_pro )
226236
227- ## removing unused files
228- unused_files = set ([".ecn" , ".err" , ".ffn" , ".fixed*" , ".fsa" , ".gff" , ".log" , ".sqn" , ".tbl" , ".val" , ".faa " ]) - set (args ["-k" ].split ("," ))
229- for extension in unused_files :
230- file_to_delete = f"{ prok_file } { extension } "
231- if os .path .exists (file_to_delete ) :
232- os .remove (file_to_delete )
233-
234- ## rename and count processed genomes
235- if os .path .exists (prok_file + ".gbf" ):
236- move (prok_file + ".gbf" ,prok_file + ".gbk" ) # rename .gbf to .gbk
237- if os .path .exists (prok_file + ".gbk" ):
238- processed .loc [len (processed )] = [genome_name , "OK" ]
237+ ## removing unused files
238+ unused_files = set ([".ecn" , ".err" , ".ffn" , ".fixed*" , ".fsa" , ".gff" , ".log" , ".sqn" , ".tbl" , ".val" , ".faa " ]) - set (to_keep )
239+ for extension in unused_files :
240+ file_to_delete = f"{ prok_file } { extension } "
241+ if os .path .exists (file_to_delete ) :
242+ os .remove (file_to_delete )
243+
244+ ## rename processed genomes
245+ if os .path .exists (prok_file + ".gbf" ):
246+ move (prok_file + ".gbf" ,prok_file + ".gbk" ) # rename .gbf to .gbk
247+
248+ ## count processed genomes
249+ if os .path .exists (prok_file + ".gbk" ):
250+ processed .loc [len (processed )] = [genome_name , "OK" ]
239251
240252 return processed
241253
@@ -275,9 +287,9 @@ def eggnog_annotation(input_dir, output_path, args):
275287 bigprint (command_egg2gbk )
276288 os .system (command_egg2gbk )
277289
278- ## rename and count processed genomes
279- if os .path .exists (out_file ):
280- processed .loc [len (processed )] = [genome_name , "OK" ]
290+ ## rename and count processed genomes
291+ if os .path .exists (out_file ):
292+ processed .loc [len (processed )] = [genome_name , "OK" ]
281293
282294 return processed
283295
@@ -326,25 +338,25 @@ def run_mpwt(output_path, annotation, genomes_names, args):
326338 path_to_singularity = args ["--ptsi" ]
327339 mkdir (os .path .join (output_path , 'mpwt' ))
328340
329- for annotool in annotation :
330- annotool_outdir = f"{ output_path } mpwt/{ annotool } /"
331- mkdir (annotool_outdir )
332-
341+ for annotool in annotation :
333342 ## checking if mpwt has successfully run before
334- path = os .path .join (output_path , "mpwt" , annotool )
335- dat_dirs = [d for d in os .listdir (path ) if os .path .isdir (os .path .join (path , d ))] ## lists subdirectories names ## filters for those which start with GCF
343+ input_dir = os .path .join (output_path , annotool )
344+ outdir = os .path .join (output_path , "mpwt" , annotool )
345+ mkdir (outdir )
346+
347+ dat_dirs = [d for d in os .listdir (outdir ) if os .path .isdir (os .path .join (outdir , d ))] ## lists subdirectories names ## filters for those which start with GCF
336348 print (f"Mpwt on { annotool } : { len (dat_dirs )} mpwt repositories found out of { len (genomes_names )} genomes to process" )
337349
338350 if len (dat_dirs ) != len (genomes_names ):
339- command_mpwt = f"singularity exec -B { path_to_scratch } :{ path_to_scratch } { path_to_scratch } { path_to_singularity } mpwt -f { output_path } { annotool } / -o { annotool_outdir } --cpu { args ['--cpus' ]} --patho --flat --clean --md -v"
351+ command_mpwt = f"singularity exec -B { path_to_scratch } :{ path_to_scratch } { path_to_scratch } { path_to_singularity } mpwt -f { input_dir } / -o { outdir } --cpu { args ['--cpus' ]} --patho --flat --clean --md -v"
340352 ## --patho : Launch PathoLogic inference on input folder
341353 ## --flat : Create BioPAX/attribute-value flat files
342354 ## --clean : Delete all PGDBs in ptools-local folder or only PGDB from input folder
343355 ## --md : Move the dat files into the output folder
344356 bigprint (command_mpwt )
345357 os .system (command_mpwt )
346358 else :
347- print (f" Nothing to process in { annotool_outdir } , moving on.\n " )
359+ print (f" Nothing to process in { outdir } , moving on.\n " )
348360
349361
350362def check_mpwt (df_summary , path ):
@@ -415,17 +427,16 @@ def convert2padmet(output_path, annotation, genomes_names, args):
415427 genomes_names (list) : list of genomes names to iterate on
416428 options (parser) : arguments from parser
417429 """
418- path_to_padmet_ref = args ["--path_to_padmet_ref " ]
430+ path_to_padmet_ref = args ["--padmet_ref " ]
419431 path_to_scratch = args ["--ptsc" ]
420432 path_to_singularity = args ["--ptsi" ]
421- padmet_output = output_path + 'padmet'
433+ padmet_output = os . path . join ( output_path , 'padmet' )
422434 mkdir (padmet_output )
423435
424436 for annotool in annotation :
425437 dat_files = os .listdir (os .path .join (output_path , "mpwt" , annotool ))
426- print (f"Checking before launching pgdb2padmet on { annotool } files : { len (dat_files )} files generated till now out of { len (genomes_names )} considered processable\n " )
438+ print (f"Checking before launching pgdb2padmet on { annotool } files : { len (dat_files )} files generated till now out of { len (genomes_names )} considered processable" )
427439 for genome_name in genomes_names :
428- print (f"testing if { os .path .join (padmet_output , genome_name , f'{ genome_name } _{ annotool } .padmet' )} exists before launching padmet conversion" )
429440 if missing_or_empty (os .path .join (padmet_output , genome_name , f"{ genome_name } _{ annotool } .padmet" )):
430441
431442 ## create files in commune directories for annotations of the same genome
@@ -543,7 +554,7 @@ def check_taxfile(args) :
543554 - genomes (str) : path to genomes directory
544555 """
545556 genomes = args ["--input" ]
546- taxfile = args ["--tax " ]
557+ taxfile = args ["--taxfile " ]
547558
548559 if not os .path .exists (taxfile ) :
549560 return False , f"ERROR : no file found at { taxfile } "
@@ -552,7 +563,7 @@ def check_taxfile(args) :
552563 cols = df_taxa .columns
553564 col_filename = cols [2 ]
554565 except :
555- return False , f"ERROR : Error reading file at the following path ; are you sure it's a tab-separated file ? \n { taxfile } "
566+ return False , f"ERROR : Error reading taxon file at the following path ; are you sure it's a tab-separated file ? \n { taxfile } "
556567
557568 for i , row in df_taxa .iterrows () :
558569 file = row [col_filename ]
@@ -581,7 +592,7 @@ def metabolic_reconstruction(args) :
581592
582593 ## Creating output directory
583594 mkdir (output_path )
584- annotation = args ["--annot" ]
595+ annotation = args ["--annot" ]. split ( "," )
585596 df_summary = pd .DataFrame (columns = ["genome" ])
586597
587598 ## unzipping and renaming to fasta if needed
@@ -604,7 +615,7 @@ def metabolic_reconstruction(args) :
604615 else :
605616 continue
606617 time_taken = time .time () - start
607- print (f"INFO : { annotool } annotation took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
618+ print (f"INFO : { annotool } annotation took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds\n " )
608619
609620 df_summary = df_summary .merge (genomes_processed , on = "genome" , how = "outer" )
610621
@@ -617,14 +628,14 @@ def metabolic_reconstruction(args) :
617628 start = time .time ()
618629 run_mpwt (output_path , annotation , genomes_names , args )
619630 time_taken = time .time () - start
620- print (f"INFO : Mpwt step took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
631+ print (f"\n INFO : Mpwt step took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
621632
622633 ## checking if mpwt ran correctly for all annotools, identify convertible genomes and convert them using padmet
623634 df_summary , genomes_names = check_files ("mpwt" , output_path , df_summary , annotation )
624635 start = time .time ()
625636 convert2padmet (output_path , annotation , genomes_names , args )
626637 time_taken = time .time () - start
627- print (f"INFO : Conversion to padmet took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
638+ print (f"\n INFO : Conversion to padmet took { time_taken // 3600 } hour(s) { (time_taken % 3600 ) // 60 } minute(s) { time_taken % 60 } seconds" )
628639
629640 ## checking if padmet ran correctly for all annotools, save progression
630641 df_summary , genomes_names = check_files ("padmet" , output_path , df_summary , annotation )
0 commit comments