Skip to content

Commit 75609f9

Browse files
committed
adapting the workflow to padmet's parser
1 parent 5d7f9f5 commit 75609f9

File tree

1 file changed

+62
-51
lines changed

1 file changed

+62
-51
lines changed

padmet/utils/exploration/metabolic_reconstruction.py

Lines changed: 62 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
Large-scale metabolic reconstruction of bacterial genomes.
77
88
usage:
9-
padmet metabolic_reconstruction [-h] -i FOLDER -o FOLDER --tax FILE --padmet_ref FILE --ptsc FOLDER --ptsi FILE [--annot STR] [--egg_path FOLDER] [--bak_path FOLDER] [-c INT] [-k STR]
9+
padmet metabolic_reconstruction [-h] -i FOLDER -o FOLDER --taxfile FILE --padmet_ref FILE --ptsc FOLDER --ptsi FILE [--annot STR] [--egg_path FOLDER] [--bak_path FOLDER] [-c INT] [--keep STR]
1010
1111
-h --help Show this help message and exit
12-
-i --input=FOLDER Path to the folder where the genomes are
13-
-o --output=FOLDER Path to the folder where you want to put the results in
14-
--tax=FILE Path to the taxon file (.tsv)
12+
-i --input=STR Path to the folder where the genomes are
13+
-o --output=STR Path to the folder where you want to put the results in
14+
-t --taxfile=FILE Path to the taxon file (.tsv)
1515
--padmet_ref=FILE Path to the reference database in Padmet format.
1616
--ptsc=FOLDER Path to the root folder (for construction of Singularity bridge, necessary to access distant files).
1717
--ptsi=FILE Path to the singularity image of mpwt to use.
@@ -41,9 +41,9 @@ def command_help():
4141
"""
4242
print(docopt.docopt(__doc__))
4343

44-
def padmet_stats_cli(command_args):
44+
def metabolic_reconstruction_cli(command_args):
4545
args = docopt.docopt(__doc__, argv=command_args)
46-
46+
4747
metabolic_reconstruction(args)
4848

4949

@@ -167,6 +167,10 @@ def bakta_annotation(input_dir, output_path, args):
167167
path_to_bak = args["--bak_path"]
168168
mkdir(os.path.join(output_path, 'bakta'))
169169
processed = pd.DataFrame(columns = ['genome', "bakta"])
170+
if (args["--keep"]) != None :
171+
to_keep = args["--keep"].split(",")
172+
else :
173+
to_keep = []
170174

171175
for genome_name in os.listdir(input_dir) :
172176
output = (os.path.join(output_path, "bakta", genome_name))
@@ -182,19 +186,21 @@ def bakta_annotation(input_dir, output_path, args):
182186
## --compliant Force Genbank/ENA/DDJB compliance
183187
## --force Force overwriting existing output folder
184188

185-
## removing unused files
186-
unused_files = set([".embl", ".faa", ".ffn", ".fna", ".gff3", ".hypotheticals.faa", ".hypotheticals.tsv", ".json", ".log", ".png", ".svg", ".tsv"]) - set(args["-k"].split(","))
187-
for extension in unused_files :
188-
file_to_delete = os.path.join(output, genome_name + extension)
189-
if os.path.exists(file_to_delete) :
190-
os.remove(file_to_delete)
191-
192-
## rename and count processed genomes
193-
if os.path.exists(os.path.join(output, genome_name + ".gbff")):
194-
move(os.path.join(output, genome_name + ".gbff"), final_file)
195-
if os.path.exists(final_file):
196-
processed.loc[len(processed)] = [genome_name, "OK"]
189+
## removing unused files
190+
unused_files = set([".embl", ".faa", ".ffn", ".fna", ".gff3", ".hypotheticals.faa", ".hypotheticals.tsv", ".json", ".log", ".png", ".svg", ".tsv"]) - set(to_keep)
191+
for extension in unused_files :
192+
file_to_delete = os.path.join(output, genome_name + extension)
193+
if os.path.exists(file_to_delete) :
194+
os.remove(file_to_delete)
195+
196+
## rename processed genomes
197+
if os.path.exists(os.path.join(output, genome_name + ".gbff")):
198+
move(os.path.join(output, genome_name + ".gbff"), final_file)
197199

200+
## count processed genomes
201+
if os.path.exists(final_file):
202+
processed.loc[len(processed)] = [genome_name, "OK"]
203+
198204
return processed
199205

200206

@@ -211,6 +217,10 @@ def prokka_annotation(input_dir, output_path, args) :
211217
print("Prokka annotation launched.\n")
212218
mkdir(os.path.join(output_path, 'prokka'))
213219
processed = pd.DataFrame(columns = ['genome', "prokka"])
220+
if (args["--keep"]) != None :
221+
to_keep = args["--keep"].split(",")
222+
else :
223+
to_keep = []
214224

215225
for genome_name in os.listdir(input_dir) :
216226
prok_file = os.path.join(output_path, "prokka", genome_name, genome_name)
@@ -224,18 +234,20 @@ def prokka_annotation(input_dir, output_path, args) :
224234
bigprint(command_pro)
225235
os.system(command_pro)
226236

227-
## removing unused files
228-
unused_files=set([".ecn", ".err", ".ffn", ".fixed*", ".fsa", ".gff", ".log", ".sqn", ".tbl", ".val", ".faa "]) - set(args["-k"].split(","))
229-
for extension in unused_files :
230-
file_to_delete = f"{prok_file}{extension}"
231-
if os.path.exists(file_to_delete) :
232-
os.remove(file_to_delete)
233-
234-
## rename and count processed genomes
235-
if os.path.exists(prok_file + ".gbf"):
236-
move(prok_file+".gbf",prok_file+".gbk") # rename .gbf to .gbk
237-
if os.path.exists(prok_file + ".gbk"):
238-
processed.loc[len(processed)] = [genome_name, "OK"]
237+
## removing unused files
238+
unused_files=set([".ecn", ".err", ".ffn", ".fixed*", ".fsa", ".gff", ".log", ".sqn", ".tbl", ".val", ".faa "]) - set(to_keep)
239+
for extension in unused_files :
240+
file_to_delete = f"{prok_file}{extension}"
241+
if os.path.exists(file_to_delete) :
242+
os.remove(file_to_delete)
243+
244+
## rename processed genomes
245+
if os.path.exists(prok_file + ".gbf"):
246+
move(prok_file+".gbf",prok_file+".gbk") # rename .gbf to .gbk
247+
248+
## count processed genomes
249+
if os.path.exists(prok_file + ".gbk"):
250+
processed.loc[len(processed)] = [genome_name, "OK"]
239251

240252
return processed
241253

@@ -275,9 +287,9 @@ def eggnog_annotation(input_dir, output_path, args):
275287
bigprint(command_egg2gbk)
276288
os.system(command_egg2gbk)
277289

278-
## rename and count processed genomes
279-
if os.path.exists(out_file):
280-
processed.loc[len(processed)] = [genome_name, "OK"]
290+
## count processed genomes
291+
if os.path.exists(out_file):
292+
processed.loc[len(processed)] = [genome_name, "OK"]
281293

282294
return processed
283295

@@ -326,25 +338,25 @@ def run_mpwt(output_path, annotation, genomes_names, args):
326338
path_to_singularity = args["--ptsi"]
327339
mkdir(os.path.join(output_path, 'mpwt'))
328340

329-
for annotool in annotation :
330-
annotool_outdir = f"{output_path}mpwt/{annotool}/"
331-
mkdir(annotool_outdir)
332-
341+
for annotool in annotation :
333342
## checking if mpwt has successfully run before
334-
path = os.path.join(output_path, "mpwt", annotool)
335-
dat_dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] ## lists subdirectories names ## filters for those which start with GCF
343+
input_dir = os.path.join(output_path, annotool)
344+
outdir = os.path.join(output_path, "mpwt", annotool)
345+
mkdir(outdir)
346+
347+
dat_dirs = [d for d in os.listdir(outdir) if os.path.isdir(os.path.join(outdir, d))] ## lists subdirectories names ## filters for those which start with GCF
336348
print(f"Mpwt on {annotool} : {len(dat_dirs)} mpwt repositories found out of {len(genomes_names)} genomes to process")
337349

338350
if len(dat_dirs) != len(genomes_names):
339-
command_mpwt = f"singularity exec -B {path_to_scratch}:{path_to_scratch} {path_to_scratch}{path_to_singularity} mpwt -f {output_path}{annotool}/ -o {annotool_outdir} --cpu {args['--cpus']} --patho --flat --clean --md -v"
351+
command_mpwt = f"singularity exec -B {path_to_scratch}:{path_to_scratch} {path_to_scratch}{path_to_singularity} mpwt -f {input_dir}/ -o {outdir} --cpu {args['--cpus']} --patho --flat --clean --md -v"
340352
## --patho : Launch PathoLogic inference on input folder
341353
## --flat : Create BioPAX/attribute-value flat files
342354
## --clean : Delete all PGDBs in ptools-local folder or only PGDB from input folder
343355
## --md : Move the dat files into the output folder
344356
bigprint(command_mpwt)
345357
os.system(command_mpwt)
346358
else :
347-
print(f" Nothing to process in {annotool_outdir}, moving on.\n")
359+
print(f" Nothing to process in {outdir}, moving on.\n")
348360

349361

350362
def check_mpwt(df_summary, path):
@@ -415,17 +427,16 @@ def convert2padmet(output_path, annotation, genomes_names, args):
415427
genomes_names (list) : list of genomes names to iterate on
416428
options (parser) : arguments from parser
417429
"""
418-
path_to_padmet_ref= args["--path_to_padmet_ref"]
430+
path_to_padmet_ref= args["--padmet_ref"]
419431
path_to_scratch = args["--ptsc"]
420432
path_to_singularity = args["--ptsi"]
421-
padmet_output = output_path + 'padmet'
433+
padmet_output = os.path.join(output_path, 'padmet')
422434
mkdir(padmet_output)
423435

424436
for annotool in annotation :
425437
dat_files = os.listdir(os.path.join(output_path, "mpwt", annotool))
426-
print(f"Checking before launching pgdb2padmet on {annotool} files : {len(dat_files)} files generated till now out of {len(genomes_names)} considered processable\n")
438+
print(f"Checking before launching pgdb2padmet on {annotool} files : {len(dat_files)} files generated till now out of {len(genomes_names)} considered processable")
427439
for genome_name in genomes_names :
428-
print(f"testing if {os.path.join(padmet_output, genome_name, f'{genome_name}_{annotool}.padmet')} exists before launching padmet conversion" )
429440
if missing_or_empty(os.path.join(padmet_output, genome_name, f"{genome_name}_{annotool}.padmet")):
430441

431442
## create files in common directories for annotations of the same genome
@@ -543,7 +554,7 @@ def check_taxfile(args) :
543554
- genomes (str) : path to genomes directory
544555
"""
545556
genomes = args["--input"]
546-
taxfile = args["--tax"]
557+
taxfile = args["--taxfile"]
547558

548559
if not os.path.exists(taxfile) :
549560
return False, f"ERROR : no file found at {taxfile}"
@@ -552,7 +563,7 @@ def check_taxfile(args) :
552563
cols = df_taxa.columns
553564
col_filename = cols[2]
554565
except :
555-
return False, f"ERROR : Error reading file at the following path ; are you sure it's a tab-separated file ? \n{taxfile}"
566+
return False, f"ERROR : Error reading taxon file at the following path ; are you sure it's a tab-separated file ? \n{taxfile}"
556567

557568
for i, row in df_taxa.iterrows() :
558569
file = row[col_filename]
@@ -581,7 +592,7 @@ def metabolic_reconstruction(args) :
581592

582593
## Creating output directory
583594
mkdir(output_path)
584-
annotation = args["--annot"]
595+
annotation = args["--annot"].split(",")
585596
df_summary = pd.DataFrame(columns = ["genome"])
586597

587598
## unzipping and renaming to fasta if needed
@@ -604,7 +615,7 @@ def metabolic_reconstruction(args) :
604615
else :
605616
continue
606617
time_taken = time.time() - start
607-
print(f"INFO : {annotool} annotation took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
618+
print(f"INFO : {annotool} annotation took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds\n")
608619

609620
df_summary = df_summary.merge(genomes_processed, on = "genome", how = "outer")
610621

@@ -617,14 +628,14 @@ def metabolic_reconstruction(args) :
617628
start = time.time()
618629
run_mpwt(output_path, annotation, genomes_names, args)
619630
time_taken = time.time() - start
620-
print(f"INFO : Mpwt step took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
631+
print(f"\nINFO : Mpwt step took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
621632

622633
## checking if mpwt ran correctly for all annotools, identify convertible genomes and convert them using padmet
623634
df_summary, genomes_names = check_files("mpwt", output_path, df_summary, annotation)
624635
start = time.time()
625636
convert2padmet(output_path, annotation, genomes_names, args)
626637
time_taken = time.time() - start
627-
print(f"INFO : Conversion to padmet took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
638+
print(f"\nINFO : Conversion to padmet took {time_taken // 3600} hour(s) {(time_taken % 3600) // 60} minute(s) {time_taken % 60} seconds")
628639

629640
## checking if padmet ran correctly for all annotools, save progression
630641
df_summary, genomes_names = check_files("padmet", output_path, df_summary, annotation)

0 commit comments

Comments
 (0)