Skip to content

Commit 1a2b07b

Browse files
committed
update to v1.3.0
1 parent 3b67e3c commit 1a2b07b

File tree

2 files changed

+34
-8
lines changed

2 files changed

+34
-8
lines changed

bin/cidder

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -208,9 +208,9 @@ def cidder_main():
208208

209209
# process input
210210
if taxa_name != "None" and taxa_name != None:
211-
util.downloadGTDBGenomes(taxa_name, gtdb_release, outdir, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag, automated_download=automate_flag)
211+
util.downloadGTDBGenomes(taxa_name, gtdb_release, outdir, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag, automated_download=automate_flag, gunzip=True, threads=threads)
212212
if len(genomes) > 0:
213-
util.processInputGenomes(genomes, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag)
213+
util.processInputGenomes(genomes, all_genomes_listing_file, logObject, sanity_check=sanity_check_flag, allow_gzipped=False)
214214

215215
number_of_genomes = 0
216216
genome_name_to_path = {}

src/skDER/util.py

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ async def main():
6868
loop.run_until_complete(main())
6969
loop.close()
7070

71-
def downloadGTDBGenomes(taxa_name, gtdb_release, outdir, genome_listing_file, logObject, sanity_check=False, automated_download=False):
71+
def downloadGTDBGenomes(taxa_name, gtdb_release, outdir, genome_listing_file, logObject, sanity_check=False, automated_download=False, gunzip=False, threads=1):
7272
"""
7373
Download GTDB genomes from NCBI Genbank using ncbi-genome-download.
7474
**********************************************************
@@ -157,6 +157,7 @@ def downloadGTDBGenomes(taxa_name, gtdb_release, outdir, genome_listing_file, lo
157157

158158
gf_listing_handle = open(genome_listing_file, 'a+')
159159
final_genome_count = 0
160+
gunzip_cmds = []
160161
for f in os.listdir(genomes_directory):
161162
genome_file = genomes_directory + f
162163
if not os.path.isfile(genome_file): continue
@@ -178,9 +179,27 @@ def downloadGTDBGenomes(taxa_name, gtdb_release, outdir, genome_listing_file, lo
178179
polished_filename = url_file_to_polished_name[f]
179180
renamed_gfile = genomes_directory + polished_filename
180181
os.rename(genome_file, renamed_gfile)
182+
if gunzip:
183+
gunzip_cmds.append(['gunzip', renamed_gfile])
184+
renamed_gfile = renamed_gfile[:-3]
181185
gf_listing_handle.write(renamed_gfile + '\n')
182186
gf_listing_handle.close()
183-
187+
188+
if len(gunzip_cmds) > 0:
189+
try:
190+
msg = "Uncompressing %d genomic assemblies for gene calling!" % genome_count
191+
sys.stdout.write(msg + '\n')
192+
logObject.info(msg)
193+
p = multiprocessing.Pool(threads)
194+
for _ in tqdm.tqdm(p.imap_unordered(multiProcess, gunzip_cmds), total=len(gunzip_cmds)):
195+
pass
196+
p.close()
197+
except Exception as e:
198+
msg = "An error occurred during multiprocessing: %s" % str(e)
199+
sys.stderr.write(msg + '\n')
200+
logObject.info(msg)
201+
202+
184203
msg = 'Was able to download %d of %d genomes belonging to taxa "%s" in GTDB %s.' % (final_genome_count, genome_count, taxa_name, gtdb_release)
185204
sys.stdout.write(msg + '\n')
186205
logObject.info(msg)
@@ -288,7 +307,7 @@ def processInputProteomes(proteomes, combined_proteome_faa, genome_to_path, logO
288307
combined_proteome_handle.close()
289308
return proteome_name_to_path
290309

291-
def processInputGenomes(genomes, genome_listing_file, logObject, sanity_check=False):
310+
def processInputGenomes(genomes, genome_listing_file, logObject, sanity_check=False, allow_gzipped=True):
292311
gf_listing_handle = open(genome_listing_file, 'a+')
293312
for g in genomes:
294313
g_path = os.path.abspath(g)
@@ -298,7 +317,11 @@ def processInputGenomes(genomes, genome_listing_file, logObject, sanity_check=Fa
298317
suffix = genome_file.split('/')[-1].split('.')[-1].lower()
299318
if genome_file.endswith('.gz'):
300319
suffix = genome_file.split('/')[-1][:-3].split('.')[-1].lower()
301-
320+
if not allow_gzipped:
321+
msg = 'Warning: genome %s is gzipped. Skipping ...' % genome_file
322+
sys.stderr.write(msg + '\n')
323+
logObject.warning(msg)
324+
continue
302325
try:
303326
assert (suffix in ACCEPTED_FASTA_SUFFICES)
304327
except:
@@ -323,7 +346,11 @@ def processInputGenomes(genomes, genome_listing_file, logObject, sanity_check=Fa
323346
suffix = genome_file.split('/')[-1].split('.')[-1].lower()
324347
if genome_file.endswith('.gz'):
325348
suffix = genome_file.split('/')[-1][:-3].split('.')[-1].lower()
326-
349+
if not allow_gzipped:
350+
msg = 'Warning: genome %s is gzipped. Skipping ...' % genome_file
351+
sys.stderr.write(msg + '\n')
352+
logObject.warning(msg)
353+
continue
327354
try:
328355
assert (suffix in ACCEPTED_FASTA_SUFFICES)
329356
except:
@@ -393,7 +420,6 @@ def determineN50(genome_listing_file, outdir, logObject, threads=1):
393420
p = multiprocessing.Pool(threads)
394421
for _ in tqdm.tqdm(p.imap_unordered(compute_n50, n50_inputs), total=len(n50_inputs)):
395422
pass
396-
p.map(compute_n50, n50_inputs)
397423
p.close()
398424
except Exception as e:
399425
msg = "An error occurred during multiprocessing: %s" % str(e)

0 commit comments

Comments
 (0)