Skip to content

Commit 5212b3c

Browse files
committed
switch to fstring for consistency
1 parent 4f02410 commit 5212b3c

File tree

2 files changed

+45
-47
lines changed

2 files changed

+45
-47
lines changed

src/melon/cli.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ def cli(argv=sys.argv):
1212
'''
1313
Entry point for command line interface.
1414
'''
15-
parser = ArgumentParser(description='Melon: \
16-
metagenomic long-read-based taxonomic identification and quantification', add_help=False)
15+
parser = ArgumentParser(
16+
description='Melon: metagenomic long-read-based taxonomic identification and quantification', add_help=False)
1717
required = parser.add_argument_group('required arguments')
1818
optional = parser.add_argument_group('optional arguments')
1919
additional = parser.add_argument_group('additional arguments')
@@ -44,7 +44,7 @@ def cli(argv=sys.argv):
4444
metavar='INT',
4545
type=int,
4646
default=os.cpu_count(),
47-
help='Number of threads. [{}]'.format(os.cpu_count()))
47+
help=f'Number of threads. [{os.cpu_count()}]')
4848

4949
optional.add_argument(
5050
'-k',
@@ -122,11 +122,11 @@ def cli(argv=sys.argv):
122122
parser.add_argument('-h', '--help', action='help', help=SUPPRESS)
123123

124124
if len(argv) == 1:
125-
print("\
125+
print(f"\
126126
__ \n\
127127
__ _ ___ / /__ ___ \n\
128128
/ ' \/ -_) / _ \/ _ \ \n\
129-
/_/_/_/\__/_/\___/_//_/ ver. {}\n".format(__version__))
129+
/_/_/_/\__/_/\___/_//_/ ver. {__version__}\n")
130130

131131
opt = parser.parse_args(argv[1:])
132132
run(opt)
@@ -140,49 +140,48 @@ def run(opt):
140140
if not os.path.isdir(opt.output):
141141
os.makedirs(opt.output, exist_ok=True)
142142
else:
143-
logger.warning('Folder <{}> exists. Files will be overwritten.'.format(opt.output))
143+
logger.warning(f'Folder <{opt.output}> exists. Files will be overwritten.')
144144

145145
## check for input files
146146
for file in opt.FILE:
147147
if not os.path.isfile(file):
148-
logger.critical('File <{}> does not exist.'.format(file))
148+
logger.critical(f'File <{file}> does not exist.')
149149
sys.exit(2)
150150

151151
## check for database
152152
if not os.path.isdir(opt.db):
153-
logger.critical('Database folder <{}> does not exist.'.format(opt.db))
153+
logger.critical(f'Database folder <{opt.db}> does not exist.')
154154
sys.exit(2)
155155
else:
156156
files = [os.path.basename(file) for file in glob.glob(os.path.join(opt.db, '*'))]
157157
if (
158158
'metadata.tsv' not in files or 'prot.dmnd' not in files or
159159
len([file for file in files if 'nucl' in file and '.mmi' in file]) != 16
160160
):
161-
logger.critical('Database <{}> is not complete or not indexed.'.format(opt.db))
161+
logger.critical(f'Database <{opt.db}> is not complete or not indexed.')
162162
sys.exit(2)
163163

164164
## check for kraken2 database
165165
if opt.db_kraken is not None:
166166
if not os.path.isdir(opt.db_kraken):
167-
logger.critical('Kraken2 database folder <{}> does not exist.'.format(opt.db_kraken))
167+
logger.critical(f'Kraken2 database folder <{opt.db_kraken}> does not exist.')
168168
sys.exit(2)
169169
else:
170170
files = [os.path.basename(file) for file in glob.glob(os.path.join(opt.db_kraken, '*'))]
171171
if 'ktaxonomy.tsv' not in files or len([file for file in files if 'database' in file]) != 7:
172-
logger.critical('Kraken2 database <{}> is not complete.'.format(opt.db_kraken))
172+
logger.critical(f'Kraken2 database <{opt.db_kraken}> is not complete.')
173173
sys.exit(2)
174174

175175
## check for logical cores
176176
if opt.threads > os.cpu_count():
177-
logger.warning(
178-
'Threads <{}> exceeds available logical cores, will use <{}> instead.'.format(opt.threads, os.cpu_count()))
177+
logger.warning(f'Threads <{opt.threads}> exceeds available logical cores, will use <{os.cpu_count()}> instead.')
179178
opt.threads = os.cpu_count()
180179
os.environ['OMP_NUM_THREADS'] = str(opt.threads)
181180

182181
## run
183182
for index, file in enumerate(opt.FILE):
184183
if len(opt.FILE) > 1:
185-
logger.info('Processing file <{}> ({}/{}) ...'.format(file, index + 1, len(opt.FILE)))
184+
logger.info(f'Processing file <{file}> ({index + 1}/{len(opt.FILE)}) ...')
186185

187186
GenomeProfiler(file, opt.db, opt.output, opt.threads).run(
188187
db_kraken=opt.db_kraken, skip_profile=opt.skip_profile, skip_clean=opt.skip_clean,

src/melon/melon.py

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def __init__(self, file, db, output, threads=os.cpu_count()):
1818
self.db = db
1919
self.output = output
2020
self.threads = threads
21+
self.outfile = get_filename(self.file, self.output)
2122

2223
self.aset = {'l2', 'l11', 'l10e', 'l15e', 'l18e', 's3ae', 's19e', 's28e'}
2324
self.bset = {'l2', 'l11', 'l20', 'l27', 's2', 's7', 's9', 's16'}
@@ -41,8 +42,8 @@ def run_kraken(self, db_kraken):
4142
subprocess.run([
4243
'kraken2',
4344
'--db', db_kraken,
44-
'--report', get_filename(self.file, self.output, '.kraken.report.tmp'),
45-
'--output', get_filename(self.file, self.output, '.kraken.output.tmp'),
45+
'--report', f'{self.outfile}.kraken.report.tmp',
46+
'--output', f'{self.outfile}.kraken.output.tmp',
4647
'--threads', str(self.threads),
4748
self.file,
4849
], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
@@ -55,7 +56,7 @@ def parse_kraken(self):
5556
'''
5657
qseqids, taxids = set(), set()
5758
record = False
58-
with open(get_filename(self.file, self.output, '.kraken.report.tmp')) as f:
59+
with open(f'{self.outfile}.kraken.report.tmp') as f:
5960
for line in f:
6061
ls = line.rstrip().split('\t')
6162
if ls[3] in {'D', 'R1'}:
@@ -64,7 +65,7 @@ def parse_kraken(self):
6465
if record:
6566
taxids.add(ls[4])
6667

67-
with open(get_filename(self.file, self.output, '.kraken.output.tmp')) as f:
68+
with open(f'{self.outfile}.kraken.output.tmp') as f:
6869
for line in f:
6970
ls = line.rstrip().split('\t')
7071
if ls[2] in taxids:
@@ -81,7 +82,7 @@ def run_diamond(self, max_target_seqs=25, evalue=1e-15, identity=0, subject_cove
8182
'diamond', 'blastx',
8283
'--db', os.path.join(self.db, 'prot.dmnd'),
8384
'--query', self.file,
84-
'--out', get_filename(self.file, self.output, '.diamond.tmp'),
85+
'--out', f'{self.outfile}.diamond.tmp',
8586
'--outfmt', '6', *outfmt,
8687
'--evalue', str(evalue), '--subject-cover', str(subject_cover), '--id', str(identity),
8788
'--range-culling', '-F', '15', '--range-cover', '25',
@@ -95,7 +96,7 @@ def parse_diamond(self):
9596
'''
9697
qcoords = defaultdict(set)
9798

98-
with open(get_filename(self.file, self.output, '.diamond.tmp')) as f:
99+
with open(f'{self.outfile}.diamond.tmp') as f:
99100
for line in f:
100101
ls = line.rstrip().split('\t')
101102
qseqid, sseqid = ls[0], ls[1]
@@ -117,27 +118,27 @@ def parse_diamond(self):
117118
):
118119
## append qseqid and coordinates for back-tracing
119120
self.hits.append([qseqid, kingdom, family, qstart, qend])
120-
self.copies[sseqid.split('-')[-2]] += 0.125
121+
self.copies[kingdom] += 0.125
121122

122123
def run_minimap(self, secondary_num=2147483647, secondary_ratio=0.9):
123124
'''
124125
Run minimap2 to get taxonomic profiles.
125126
'''
126-
with open(get_filename(self.file, self.output, '.sequence.tmp'), 'w') as w:
127+
with open(f'{self.outfile}.sequence.tmp', 'w') as w:
127128
w.write(extract_sequences(self.file, {hit[0] for hit in self.hits}))
128129

129130
## consider each kingdom + family separately
130131
qseqids = defaultdict(set)
131132
for hit in self.hits:
132133
qseqids[hit[1] + '.' + hit[2].replace('/', '_')].add(hit[0])
133134

134-
with open(get_filename(self.file, self.output, '.minimap.tmp'), 'w') as w:
135+
with open(f'{self.outfile}.minimap.tmp', 'w') as w:
135136
with logging_redirect_tqdm():
136137
queue = tqdm(qseqids.items(), leave=False)
137138
for family, qseqid in queue:
138139
queue.set_description(f'==> Processing <{family}>')
139140

140-
sequences = extract_sequences(get_filename(self.file, self.output, '.sequence.tmp'), qseqid)
141+
sequences = extract_sequences(f'{self.outfile}.sequence.tmp', qseqid)
141142
subprocess.run([
142143
'minimap2',
143144
'-cx', 'map-ont',
@@ -238,9 +239,9 @@ def postprocess(self, max_iteration=1000, epsilon=1e-10):
238239

239240
## return assignments
240241
assignments = []
241-
for row in p_reads.tocsr():
242-
row = row.toarray().squeeze()
243-
assignments.append(np.where(row == row.max())[0].tolist())
242+
for p_read in p_reads.tocsr():
243+
p_read = p_read.toarray().squeeze()
244+
assignments.append(np.where(p_read == p_read.max())[0].tolist())
244245

245246
ties = defaultdict(set)
246247
for qseqid, lineage in enumerate(assignments):
@@ -252,16 +253,16 @@ def postprocess(self, max_iteration=1000, epsilon=1e-10):
252253
## resolve ties for equal probability cases using AS, MS and ID
253254
if ties:
254255
qset = set.union(*(set(qseqid) for qseqid in ties.values()))
255-
data = [row for row in data if row[0] in qset]
256+
alignments = [alignment for alignment in self.alignments if alignment[0] in qset]
256257

257258
for lineages, qseqids in ties.items():
258-
target = [row for row in data if row[0] in qseqids and row[-1] in lineages]
259+
targets = [alignment for alignment in alignments if alignment[0] in qseqids and alignment[-1] in lineages]
259260

260261
scores = defaultdict(lambda: defaultdict(list))
261-
for row in target:
262-
scores[row[-1]]['AS'].append(row[2])
263-
scores[row[-1]]['DE'].append(row[3])
264-
scores[row[-1]]['ID'].append(row[4])
262+
for target in targets:
263+
scores[target[-1]]['AS'].append(target[2])
264+
scores[target[-1]]['DE'].append(target[3])
265+
scores[target[-1]]['ID'].append(target[4])
265266

266267
## if all the same, choose the one with known species name
267268
target = sorted([
@@ -277,7 +278,7 @@ def postprocess(self, max_iteration=1000, epsilon=1e-10):
277278
for qseqid in qseqids:
278279
self.assignments[qseqid] = target
279280

280-
def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
281+
def run(self, debug=False, db_kraken=None, skip_profile=False, skip_clean=False,
281282
max_target_seqs=25, evalue=1e-15, identity=0, subject_cover=75,
282283
secondary_num=2147483647, secondary_ratio=0.9,
283284
max_iteration=1000, epsilon=1e-10):
@@ -286,23 +287,22 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
286287
'''
287288
if db_kraken is not None:
288289
logger.info('Filtering reads ...')
289-
self.run_kraken(db_kraken)
290+
if not debug: self.run_kraken(db_kraken=db_kraken)
290291
self.parse_kraken()
291-
logger.info('... removed {} putatively non-prokaryotic reads.'.format(len(self.nset)))
292+
logger.info(f'... removed {len(self.nset)} putatively non-prokaryotic reads.')
292293

293294
logger.info('Estimating genome copies ...')
294-
self.run_diamond(max_target_seqs, evalue, identity, subject_cover)
295+
if not debug: self.run_diamond(max_target_seqs=max_target_seqs, evalue=evalue, identity=identity, subject_cover=subject_cover)
295296
self.parse_diamond()
296-
logger.info('... found {} copies of genomes (bacteria: {}; archaea: {}).'.format(
297-
sum(self.copies.values()), self.copies['bacteria'], self.copies['archaea']))
297+
logger.info(f"... found {sum(self.copies.values())} copies of genomes (bacteria: {self.copies['bacteria']}; archaea: {self.copies['archaea']}).")
298298

299299
if not skip_profile:
300300
logger.info('Assigning taxonomy ...')
301-
self.run_minimap(secondary_num, secondary_ratio)
301+
if not debug: self.run_minimap(secondary_num=secondary_num, secondary_ratio=secondary_ratio)
302302

303303
logger.info('Reassigning taxonomy ...')
304304
self.parse_minimap()
305-
self.postprocess(max_iteration, epsilon)
305+
self.postprocess(max_iteration=max_iteration, epsilon=epsilon)
306306

307307
## fill missing ones according to hits
308308
replacements = {
@@ -336,24 +336,23 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
336336
], key=lambda row: (row[8], row[10], row[7]))
337337

338338
richness = {'bacteria': 0, 'archaea': 0}
339-
with open(get_filename(self.file, self.output, '.tsv'), 'w') as w:
339+
with open(f'{self.outfile}.tsv', 'w') as w:
340340
w.write('\t'.join(self.ranks + ['copy', 'abundance', 'identity']) + '\n')
341341

342342
for row in self.profile:
343343
if not re.search('unclassified (Bacteria|Archaea) species', row[6]):
344344
richness[row[0].split('|')[-1].lower()] += 1
345345
w.write('\t'.join(row[:7] + [f'{row[7]:.3f}', f'{row[8]:e}', f'{row[9]:.4f}/{row[10]:.4f}']) + '\n')
346346

347-
logger.info('... found {} unique species (bacteria: {}; archaea: {}).'.format(
348-
sum(richness.values()), richness['bacteria'], richness['archaea']))
347+
logger.info(f"... found {sum(richness.values())} unique species (bacteria: {richness['bacteria']}; archaea: {richness['archaea']}).")
349348

350349
else:
351350
self.profile = sorted([
352351
['Bacteria', self.copies['bacteria'], self.copies['bacteria'] / sum(self.copies.values())],
353352
['Archaea', self.copies['archaea'], self.copies['archaea'] / sum(self.copies.values())]
354353
], key=lambda row: (row[-2], row[-3]))
355354

356-
with open(get_filename(self.file, self.output, '.tsv'), 'w') as w:
355+
with open(f'{self.outfile}.tsv', 'w') as w:
357356
w.write('\t'.join(['superkingdom', 'copy', 'abundance']) + '\n')
358357

359358
for row in self.profile:
@@ -369,10 +368,10 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
369368
reads.update({hit[0]: {
370369
'remark': 'marker-gene-containing', 'lineage': hit[1].capitalize()} for hit in self.hits})
371370

372-
with open(get_filename(self.file, self.output, '.json'), 'w') as w:
371+
with open(f'{self.outfile}.json', 'w') as w:
373372
json.dump(dict(sorted(reads.items())), w, indent=4)
374373

375374
## clean up
376375
if not skip_clean:
377-
for file in glob.glob(get_filename(self.file, self.output, '.*.tmp')):
376+
for file in glob.glob(f'{self.outfile}.*.tmp'):
378377
os.remove(file)

0 commit comments

Comments
 (0)