Skip to content

Commit 91f649b

Browse files
committed
incorporating metabolic_reonstruction
1 parent c0c3077 commit 91f649b

File tree

3 files changed

+59
-77
lines changed

3 files changed

+59
-77
lines changed

padmet/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@
1515
along with padmet. If not, see <http://www.gnu.org/licenses/>.
1616
"""
1717

18-
__version__='5.0.3'
18+
__version__='5.1.3'

padmet/__main__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@ def main(args=None):
112112
sys.exit(command + ' not a valid command.')
113113

114114
# Import the corresponding module
115-
print('.'+command, 'padmet.'+command_import_path)
116115
command_import = importlib.import_module('.'+command, 'padmet.'+command_import_path)
117116
if '-h' in command_args:
118117
# Return help for the command

padmet/utils/exploration/metabolic_reconstruction.py

Lines changed: 58 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,22 @@
44
"""
55
Description:
66
Large-scale metabolic reconstruction of bacterial genomes.
7-
8-
::
97
108
usage:
11-
padmet metabolic_reconstruction [-h] -i INPUT -o OUTPUT --tax TAXFILE --padmet_ref PATH_TO_PADMET_REF --ptsc PTSC --ptsi PTSI [--annot ANNOT] [--egg_path EGG_PATH] [--bak_path BAK_PATH] [-c CPUS] [-k TO_KEEP]
12-
-h, --help Show this help message and exit
13-
-i INPUT, --input INPUT Path to the folder where the genomes are
14-
-o OUTPUT, --output OUTPUT Path to the folder where you want to put the results in
15-
--tax TAXFILE Path to the taxon file (.tsv)
16-
--padmet_ref PATH_TO_PADMET_REF Path to the reference database in Padmet format.
17-
--ptsc PTSC Path to the root folder (for construction of Singularity bridge, necessary to access distant files).
18-
--ptsi PTSI Path to the singularity image of mpwt to use.
19-
--annot ANNOT Annotation tool(s) to use between 'bakta' (default), 'eggnog' and 'prokka'. If several annotation tools to use, write them comma-separated.
20-
--egg_path EGG_PATH Path to the Eggnog database, mandatory if you want to use eggnog as annotation tool.
21-
--bak_path BAK_PATH Path to the Bakta database, mandatory if you want to use bakta as annotation tool.
22-
-c CPUS, --cpus CPUS Give the number of available CPUs
23-
-k TO_KEEP, --keep TO_KEEP Give the file formats to keep - comma-separated list, '.' included
9+
padmet metabolic_reconstruction [-h] -i FOLDER -o FOLDER --tax FILE --padmet_ref FILE --ptsc FOLDER --ptsi FILE [--annot STR] [--egg_path FOLDER] [--bak_path FOLDER] [-c INT] [-k STR]
10+
11+
-h --help Show this help message and exit
12+
-i --input=FOLDER Path to the folder where the genomes are
13+
-o --output=FOLDER Path to the folder where you want to put the results in
14+
--tax=FILE Path to the taxon file (.tsv)
15+
--padmet_ref=FILE Path to the reference database in Padmet format.
16+
--ptsc=FOLDER Path to the root folder (for construction of Singularity bridge, necessary to access distant files).
17+
--ptsi=FILE Path to the singularity image of mpwt to use.
18+
--annot=STR Annotation tool(s) to use between 'bakta' (default), 'eggnog' and 'prokka'. If several annotation tools to use, write them comma-separated.
19+
--egg_path=FOLDER Path to the Eggnog database, mandatory if you want to use eggnog as annotation tool.
20+
--bak_path=FOLDER Path to the Bakta database, mandatory if you want to use bakta as annotation tool.
21+
-c --cpus=INT Give the number of available CPUs
22+
-k --keep=STR Give the file formats to keep - comma-separated list, '.' included
2423
"""
2524

2625
import os
@@ -42,26 +41,10 @@ def command_help():
4241
"""
4342
print(docopt.docopt(__doc__))
4443

45-
46-
def parser() :
47-
parser = argparse.ArgumentParser(description="Large-scale metabolic reconstruction of bacterial genomes.")
48-
49-
## arguments
50-
parser.add_argument("-i", "--input", required=True, dest="input",help="Path to the folder where the genomes are")
51-
parser.add_argument("-o", "--output", required=True, dest="output",help="Path to the folder where you want to put the results in")
52-
parser.add_argument("--tax", required=True, dest="taxfile",help="Path to the taxon file (.tsv)")
53-
parser.add_argument("--padmet_ref", required=True, dest="path_to_padmet_ref", help="Path to the reference database in Padmet format.")
54-
parser.add_argument("--ptsc", required=True, dest="ptsc", help="Path to the root folder (for construction of Singularity bridge, necessary to access distant files).")
55-
parser.add_argument("--ptsi", required=True, dest="ptsi", help="Path to the singularity image of mpwt to use.")
44+
def padmet_stats_cli(command_args):
45+
args = docopt.docopt(__doc__, argv=command_args)
5646

57-
## options
58-
parser.add_argument("--annot", dest="annot", default="prokka", help="Annotation tool(s) to use between 'prokka' (default), 'eggnog' and 'bakta'. If several annotation tools to use, write them comma-separated.")
59-
parser.add_argument("--egg_path",dest="egg_path",help="Path to the Eggnog database, mandatory if you want to use eggnog as annotation tool.")
60-
parser.add_argument("--bak_path",dest="bak_path",help="Path to the Bakta database, mandatory if you want to use bakta as annotation tool.")
61-
parser.add_argument("-c","--cpus", dest="cpus", default=2, help="Give the number of available CPUs")
62-
parser.add_argument("-k","--keep", dest="to_keep", default="", help="""Give the file formats to keep - comma-separated list, '.' included""")
63-
64-
return parser.parse_args()
47+
metabolic_reconstruction(args)
6548

6649

6750
def my_basename(file):
@@ -170,7 +153,7 @@ def mkdir(path) :
170153
## KEY-FUNCTIONS -----------------------------------------------------------------------------
171154

172155

173-
def bakta_annotation(input_dir, output_path, options):
156+
def bakta_annotation(input_dir, output_path, args):
174157
"""
175158
Bakta annotation step : from a fasta file, generate a GBK file of annotated genome. Iterated on all genomes
176159
Inputs :
@@ -181,7 +164,7 @@ def bakta_annotation(input_dir, output_path, options):
181164
processed (list) : list of processed genomes' names
182165
"""
183166
print("Bakta annotation launched.\n")
184-
path_to_bak = options.bak_path
167+
path_to_bak = args["--bak_path"]
185168
mkdir(os.path.join(output_path, 'bakta'))
186169
processed = pd.DataFrame(columns = ['genome', "bakta"])
187170

@@ -193,14 +176,14 @@ def bakta_annotation(input_dir, output_path, options):
193176
## annotate genomes
194177
mkdir(output)
195178
fasta = (os.path.join(input_dir, genome_name, genome_name + ".fasta"))
196-
command = f"bakta --db {path_to_bak} {fasta} --output {output} --prefix {genome_name} --compliant --force --threads {options.cpus}"
179+
command = f"bakta --db {path_to_bak} {fasta} --output {output} --prefix {genome_name} --compliant --force --threads {args['--cpus']}"
197180
bigprint(command)
198181
os.system(command)
199182
## --compliant Force Genbank/ENA/DDJB compliance
200183
## --force Force overwriting existing output folder
201184

202185
## removing unused files
203-
unused_files = set([".embl", ".faa", ".ffn", ".fna", ".gff3", ".hypotheticals.faa", ".hypotheticals.tsv", ".json", ".log", ".png", ".svg", ".tsv"]) - set(options.to_keep.split(","))
186+
unused_files = set([".embl", ".faa", ".ffn", ".fna", ".gff3", ".hypotheticals.faa", ".hypotheticals.tsv", ".json", ".log", ".png", ".svg", ".tsv"]) - set(args["-k"].split(","))
204187
for extension in unused_files :
205188
file_to_delete = os.path.join(output, genome_name + extension)
206189
if os.path.exists(file_to_delete) :
@@ -215,7 +198,7 @@ def bakta_annotation(input_dir, output_path, options):
215198
return processed
216199

217200

218-
def prokka_annotation(input_dir, output_path, options) :
201+
def prokka_annotation(input_dir, output_path, args) :
219202
"""
220203
Prokka annotation step : from a fasta file, generate a GBK file of annotated genome. Iterated on all genomes
221204
Inputs :
@@ -236,13 +219,13 @@ def prokka_annotation(input_dir, output_path, options) :
236219
## launch annotation
237220
fasta = os.path.join(input_dir, genome_name, f'{genome_name}.fasta')
238221
outdir = os.path.join(output_path, "prokka", genome_name)
239-
command_pro = f"prokka {fasta} --outdir {outdir} --prefix {genome_name} --compliant --force --cpus {options.cpus}"
222+
command_pro = f"prokka {fasta} --outdir {outdir} --prefix {genome_name} --compliant --force --cpus {args['--cpus']}"
240223
## --compliant Force Genbank/ENA/DDJB compliance
241224
bigprint(command_pro)
242225
os.system(command_pro)
243226

244227
## removing unused files
245-
unused_files=set([".ecn", ".err", ".ffn", ".fixed*", ".fsa", ".gff", ".log", ".sqn", ".tbl", ".val", ".faa "]) - set(options.to_keep.split(","))
228+
unused_files=set([".ecn", ".err", ".ffn", ".fixed*", ".fsa", ".gff", ".log", ".sqn", ".tbl", ".val", ".faa "]) - set(args["-k"].split(","))
246229
for extension in unused_files :
247230
file_to_delete = f"{prok_file}{extension}"
248231
if os.path.exists(file_to_delete) :
@@ -257,7 +240,7 @@ def prokka_annotation(input_dir, output_path, options) :
257240
return processed
258241

259242

260-
def eggnog_annotation(input_dir, output_path, options):
243+
def eggnog_annotation(input_dir, output_path, args):
261244
"""
262245
EggNOG-mapper annotation step : from a fasta file, generate a GBK file of annotated genome. Iterated on all genomes
263246
Inputs :
@@ -268,7 +251,7 @@ def eggnog_annotation(input_dir, output_path, options):
268251
processed (list) : list of processed genomes' names
269252
"""
270253
print("Eggnog annotation launched.\n")
271-
path_to_egg = options.egg_path
254+
path_to_egg = args["--egg_path"]
272255
mkdir(os.path.join(output_path, 'eggnog'))
273256
processed = pd.DataFrame(columns = ['genome', "eggnog"])
274257

@@ -280,15 +263,15 @@ def eggnog_annotation(input_dir, output_path, options):
280263
## annotation
281264
mkdir(output_eggnog)
282265
genome = os.path.join(input_dir, genome_name, genome_name + ".fasta")
283-
command_egg = f"emapper.py -i {genome} -o {genome_name} --cpu {options.cpus} --itype genome --data_dir {path_to_egg} --output_dir {output_eggnog} --dbmem --genepred prodigal --override"
266+
command_egg = f"emapper.py -i {genome} -o {genome_name} --cpu {args['--cpus']} --itype genome --data_dir {path_to_egg} --output_dir {output_eggnog} --dbmem --genepred prodigal --override"
284267
bigprint(command_egg)
285268
os.system(command_egg)
286269

287270
## conversion of eggnog output to gbk
288271
prot = os.path.join(output_eggnog, genome_name + ".emapper.genepred.fasta")
289272
gff = os.path.join(output_eggnog, genome_name + ".emapper.genepred.gff")
290273
annot = os.path.join(output_eggnog, genome_name + ".emapper.annotations")
291-
command_egg2gbk = f'emapper2gbk genomes -fn {genome} -fp {prot} -g {gff} -a {annot} -o {out_file} -gt eggnog -c {options.cpus}'
274+
command_egg2gbk = f"emapper2gbk genomes -fn {genome} -fp {prot} -g {gff} -a {annot} -o {out_file} -gt eggnog -c {args['--cpus']}"
292275
bigprint(command_egg2gbk)
293276
os.system(command_egg2gbk)
294277

@@ -299,7 +282,7 @@ def eggnog_annotation(input_dir, output_path, options):
299282
return processed
300283

301284

302-
def create_taxon_file(annotation, genomes, options):
285+
def create_taxon_file(annotation, genomes, output_path, args):
303286
"""
304287
From taxon file, generate another version of taxon file
305288
interpretable for mpwt in each annotation tool directory
@@ -310,8 +293,8 @@ def create_taxon_file(annotation, genomes, options):
310293
Output :
311294
a taxfile per annotool's directory
312295
"""
313-
output_path = options.output
314-
taxfile = options.taxfile
296+
297+
taxfile = args["--taxfile"]
315298
genomes = genomes.to_list()
316299

317300
df_taxons = pd.read_csv(taxfile, sep='\t')
@@ -330,7 +313,7 @@ def create_taxon_file(annotation, genomes, options):
330313
tax_file = os.path.join(output_path, annotool, 'taxon_id.tsv')
331314
df_to_write.to_csv(tax_file, sep="\t", index=False)
332315

333-
def run_mpwt(output_path, annotation, genomes_names, options):
316+
def run_mpwt(output_path, annotation, genomes_names, args):
334317
"""
335318
Run mpwt on GBK files to generate PGDBs
336319
Inputs :
@@ -339,8 +322,8 @@ def run_mpwt(output_path, annotation, genomes_names, options):
339322
genomes_names (list) : list of genomes names to iterate on
340323
options (parser) : arguments from parser
341324
"""
342-
path_to_scratch = options.ptsc
343-
path_to_singularity = options.ptsi
325+
path_to_scratch = args["--ptsc"]
326+
path_to_singularity = args["--ptsi"]
344327
mkdir(os.path.join(output_path, 'mpwt'))
345328

346329
for annotool in annotation :
@@ -353,7 +336,7 @@ def run_mpwt(output_path, annotation, genomes_names, options):
353336
print(f"Mpwt on {annotool} : {len(dat_dirs)} mpwt repositories found out of {len(genomes_names)} genomes to process")
354337

355338
if len(dat_dirs) != len(genomes_names):
356-
command_mpwt = f"singularity exec -B {path_to_scratch}:{path_to_scratch} {path_to_scratch}{path_to_singularity} mpwt -f {output_path}{annotool}/ -o {annotool_outdir} --cpu {options.cpus} --patho --flat --clean --md -v"
339+
command_mpwt = f"singularity exec -B {path_to_scratch}:{path_to_scratch} {path_to_scratch}{path_to_singularity} mpwt -f {output_path}{annotool}/ -o {annotool_outdir} --cpu {args['--cpus']} --patho --flat --clean --md -v"
357340
## --patho : Launch PathoLogic inference on input folder
358341
## --flat : Create BioPAX/attribute-value flat files
359342
## --clean : Delete all PGDBs in ptools-local folder or only PGDB from input folder
@@ -423,7 +406,7 @@ def check_padmet(df_summary, path):
423406
return df_summary
424407

425408

426-
def convert2padmet(output_path, annotation, genomes_names, options):
409+
def convert2padmet(output_path, annotation, genomes_names, args):
427410
"""
428411
Convert PGDBs in several .dat files into one strain-specific padmet file
429412
Inputs :
@@ -432,9 +415,9 @@ def convert2padmet(output_path, annotation, genomes_names, options):
432415
genomes_names (list) : list of genomes names to iterate on
433416
options (parser) : arguments from parser
434417
"""
435-
path_to_padmet_ref= options.path_to_padmet_ref
436-
path_to_scratch = options.ptsc
437-
path_to_singularity = options.ptsi
418+
path_to_padmet_ref= args["--path_to_padmet_ref"]
419+
path_to_scratch = args["--ptsc"]
420+
path_to_singularity = args["--ptsi"]
438421
padmet_output = output_path + 'padmet'
439422
mkdir(padmet_output)
440423

@@ -454,7 +437,7 @@ def convert2padmet(output_path, annotation, genomes_names, options):
454437
bigprint(command_pgdb2padmet_source)
455438
os.system(command_pgdb2padmet_source)
456439

457-
def merge_padmet(output_path, annotation, genomes_names, options, df_summary) :
440+
def merge_padmet(output_path, annotation, genomes_names, args, df_summary) :
458441
"""
459442
Merge padmets of a same strain all together in one padmet file
460443
Inputs :
@@ -463,8 +446,8 @@ def merge_padmet(output_path, annotation, genomes_names, options, df_summary) :
463446
genomes_names (list) : list of genomes names to iterate on
464447
options (parser) : arguments from parser
465448
"""
466-
path_to_scratch = options.ptsc
467-
path_to_singularity = options.ptsi
449+
path_to_scratch = args["--ptsc"]
450+
path_to_singularity = args["--ptsi"]
468451
padmet_output = os.path.join(output_path, 'padmet')
469452
output_merged = os.path.join(output_path, 'merged_padmet')
470453
mkdir(output_merged)
@@ -549,7 +532,7 @@ def rename(file) :
549532
return file
550533

551534

552-
def check_taxfile(options) :
535+
def check_taxfile(args) :
553536
"""
554537
Check function relying on taxon file. Rename every file name read in it by
555538
removing error-generating characters, check if the corresponding file exists
@@ -559,8 +542,9 @@ def check_taxfile(options) :
559542
- taxfile (str) : path of the taxa file
560543
- genomes (str) : path to genomes directory
561544
"""
562-
taxfile = options.taxfile
563-
genomes = options.input
545+
genomes = args["--input"]
546+
taxfile = args["--tax"]
547+
564548
if not os.path.exists(taxfile) :
565549
return False, f"ERROR : no file found at {taxfile}"
566550
try :
@@ -590,21 +574,20 @@ def check_taxfile(options) :
590574
return True, ""
591575

592576

593-
def main() :
577+
def metabolic_reconstruction(args) :
594578
## parsing arguments
595-
options = parser()
596-
input_dir = options.input
597-
output_path = options.output
579+
input_dir = args["--input"]
580+
output_path = args["--output"]
598581

599582
## Creating output directory
600583
mkdir(output_path)
601-
annotation = options.annot.split(",")
584+
annotation = args["--annot"]
602585
df_summary = pd.DataFrame(columns = ["genome"])
603586

604587
## unzipping and renaming to fasta if needed
605588
check_gzipped_only(input_dir)
606589

607-
tax_ok, message = check_taxfile(options)
590+
tax_ok, message = check_taxfile(args)
608591
if not tax_ok :
609592
print(message)
610593
exit(0)
@@ -613,11 +596,11 @@ def main() :
613596
for annotool in annotation :
614597
start = time.time()
615598
if annotool == 'prokka' :
616-
genomes_processed = prokka_annotation(input_dir, output_path, options)
599+
genomes_processed = prokka_annotation(input_dir, output_path, args)
617600
elif annotool == 'eggnog' :
618-
genomes_processed = eggnog_annotation(input_dir, output_path, options)
601+
genomes_processed = eggnog_annotation(input_dir, output_path, args)
619602
elif annotool == 'bakta' :
620-
genomes_processed = bakta_annotation(input_dir, output_path, options)
603+
genomes_processed = bakta_annotation(input_dir, output_path, args)
621604
else :
622605
continue
623606
time_taken = time.time() - start
@@ -630,16 +613,16 @@ def main() :
630613
df_summary, genomes_names = check_files("annot", output_path, df_summary, annotation)
631614

632615
## mpwt's metabolic network construction step
633-
create_taxon_file(annotation, genomes_names, options)
616+
create_taxon_file(annotation, genomes_names, output_path, args)
634617
start = time.time()
635-
run_mpwt(output_path, annotation, genomes_names, options)
618+
run_mpwt(output_path, annotation, genomes_names, args)
636619
time_taken = time.time() - start
637620
print(f"INFO : Mpwt step took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
638621

639622
## checking if mpwt ran correctly for all annotools, identify convertible genomes and convert them using padmet
640623
df_summary, genomes_names = check_files("mpwt", output_path, df_summary, annotation)
641624
start = time.time()
642-
convert2padmet(output_path, annotation, genomes_names, options)
625+
convert2padmet(output_path, annotation, genomes_names, args)
643626
time_taken = time.time() - start
644627
print(f"INFO : Conversion to padmet took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
645628

@@ -652,7 +635,7 @@ def main() :
652635

653636
## merge padmets and save summary
654637
start = time.time()
655-
df_summary = merge_padmet(output_path, annotation, genomes_names, options, df_summary)
638+
df_summary = merge_padmet(output_path, annotation, genomes_names, args, df_summary)
656639
time_taken = time.time() - start
657640
print(f"INFO : Merging padmets step took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
658641

0 commit comments

Comments
 (0)