Skip to content

Commit 25c0961

Browse files
authored
Merge pull request #141 from borijoa/refine-merge
Add classifiedRefiner
2 parents b247126 + b1eaf9b commit 25c0961

File tree

9 files changed

+554
-20
lines changed

9 files changed

+554
-20
lines changed

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ Please cite: [Kim J, Steinegger M. Metabuli: sensitive and specific metagenomic
6464
- [Compile from source code](#compile-from-source-code)
6565
- [Pre-built databases](#pre-built-databases)
6666
- [Classification](#classification)
67+
- [Refine Classification file](#refine-classification-file)
6768
- [Extract](#extract)
6869
- [GTDB-based custom database](#gtdb-based-custom-database)
6970
- [Creat a new database](#creat-a-new-database)
@@ -228,6 +229,32 @@ It is for an interactive taxonomy report (Krona). You can use any modern web bro
228229
Metabuli can classify reads against a database of any size as long as the database is fits in the hard disk, regardless of the machine's RAM size.
229230
We tested it with a MacBook Air (2020, M1, 8 GiB), where we classified about 15 M paired-end 150 bp reads (~5 GiB in size) against a database built with ~23K prokaryotic genomes (~69 GiB in size).
230231

232+
---
233+
## Refine Classification file
234+
After running the 'classify' command, you can refine classification file with some options.
235+
236+
```
237+
metabuli classifiedRefiner <i:read-by-read classification> <i:DBDIR> [options]
238+
239+
- read-by-read classification : The JobID_classifications.tsv file generated by the `classify` step.
240+
- DBDIR : The same DBDIR used in the `classify` step.
241+
242+
* Options
243+
--threads : The number of threads to utilize (all by default)
244+
--remove-unclassified : Remove unclassified reads
245+
--exclude-taxid : Remove list of taxids as well as its children (e.g., 1758,9685,1234)
246+
--select-taxid : Select list of taxids as well as its children (e.g., 1758,9685,1234)
247+
--select-columns : Select list of columns with number and handle full lineage as 7 (generated if absent) (e.g., 2,5,7,3)
248+
--report : Write report of refined classification file
249+
--rank : Adjust classification to the specified rank
250+
--rank-file-type : Choose how to handle reads assigned to higher taxonomic ranks when using the --rank option. [0: exclude higher rank, 1: include higher rank, 2: make separate file for higher rank classification]
251+
252+
```
253+
#### Output
254+
- refined classification file : `JobID_refined.tsv`
255+
- report : `JobID_refined_report.tsv`, `JobID_refined_krona.html`
256+
- higher rank classification file : `_refined_higherRanks.tsv`
257+
231258
---
232259
## Extract
233260
After running the `classify` command, you can extract reads that are classified under a specific taxon.

src/LocalCommandDeclarations.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,6 @@ extern int taxdump(int argc, const char **argv, const Command& command);
2525
extern int accession2taxid(int argc, const char **argv, const Command& command);
2626
extern int editNames(int argc, const char **argv, const Command& command);
2727
extern int createnewtaxalist(int argc, const char **argv, const Command& command);
28+
extern int classifiedRefiner(int argc, const char **argv, const Command& command);
2829

2930
#endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H

src/MetabuliBase.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,15 @@ std::vector<Command> metabuliCommands = {
181181
{"FASTA list", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
182182
{"new taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
183183
{"accession2taxid", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
184-
{"output prefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}
184+
{"output prefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
185+
{"classifiedRefiner", classifiedRefiner, &localPar.classifiedRefiner, COMMAND_FORMAT_CONVERSION,
186+
"Generates refined classification file from classified file",
187+
nullptr,
188+
"Siyoung Choi <remy0502@snu.ac.kr>",
189+
"<i: classified file> <i: taxonomy dump>",
190+
CITATION_SPACEPHARER,
191+
{{"classified file", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
192+
{"taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}
185193
};
186194

187195
std::vector<KmerThreshold> externalThreshold = {};

src/commons/LocalParameters.cpp

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,8 @@ LocalParameters::LocalParameters() :
315315
"^[0-9]+$"),
316316
PRINT_COLUMNS(PRINT_COLUMNS_ID,
317317
"--print-columns",
318-
"CSV of column numbers to be printed",
319-
"CSV of column numbers to be printed",
318+
"CSV of columns to print",
319+
"CSV of columns to print",
320320
typeid(std::string),
321321
(void *) &printColumns,
322322
"^.*$"),
@@ -375,7 +375,59 @@ LocalParameters::LocalParameters() :
375375
"Last k-mer to print",
376376
typeid(size_t),
377377
(void *) &kmerEnd,
378-
"^[0-9]+$")
378+
"^[0-9]+$"),
379+
REMOVE_UNCLASSIFIED(REMOVE_UNCLASSIFIED_ID,
380+
"--remove-unclassified",
381+
"Remove unclassified reads",
382+
"Remove unclassified reads",
383+
typeid(bool),
384+
(void *) &removeUnclassified,
385+
""),
386+
EXCLUDE_TAXID(EXCLUDE_TAXID_ID,
387+
"--exclude-taxid",
388+
"Exclude taxId as well as its children",
389+
"Exclude taxId as well as its children",
390+
typeid(std::string),
391+
(void *) &excludeTaxid,
392+
"^.*$"),
393+
SELECT_TAXID(SELECT_TAXID_ID,
394+
"--select-taxid",
395+
"Select taxId as well as its children",
396+
"Select taxId as well as its children",
397+
typeid(std::string),
398+
(void *) &selectTaxid,
399+
"^.*$"),
400+
SELECT_COLUMNS(SELECT_COLUMNS_ID,
401+
"--select-columns",
402+
"Select columns with number, (7:full lineage, generated if absent)",
403+
"Select columns with number, (7:full lineage, generated if absent)",
404+
typeid(std::string),
405+
(void *) &selectColumns,
406+
"^.*$"),
407+
REPORT(REPORT_ID,
408+
"--report",
409+
"Make report of refined classification file",
410+
"Make report of refined classification file",
411+
typeid(bool),
412+
(void *) &report,
413+
""),
414+
RANK(RANK_ID,
415+
"--rank",
416+
"Adjust classification to the specified rank",
417+
"Adjust classification to the specified rank",
418+
typeid(std::string),
419+
(void *) &rank,
420+
"^.*$"),
421+
HIGHER_RANK_FILE(HIGHER_RANK_FILE_ID,
422+
"--rank-file-type",
423+
"0: without higher rank, 1: with higher rank, 2: separate file for higher rank classification",
424+
"0: without higher rank, 1: with higher rank, 2: separate file for higher rank classification",
425+
typeid(int),
426+
(void *) &higherRankFile,
427+
"^[0-2]$")
428+
429+
430+
379431
{
380432
// Initialize the parameters
381433
// Superkingdom taxonomy id
@@ -429,6 +481,17 @@ LocalParameters::LocalParameters() :
429481
printMode = 0;
430482
contamList = "";
431483

484+
// classified to full taxonomy
485+
removeUnclassified = false;
486+
excludeTaxid = "";
487+
selectTaxid = "";
488+
selectColumns = "";
489+
report = false;
490+
rank = "";
491+
higherRankFile = 0;
492+
493+
494+
432495

433496
// build
434497
build.push_back(&PARAM_THREADS);
@@ -550,6 +613,18 @@ LocalParameters::LocalParameters() :
550613
expand_diffidx.push_back(&KMER_END);
551614

552615
query2reference.push_back(&TEST_RANK);
616+
617+
//classified2full
618+
classifiedRefiner.push_back(&REMOVE_UNCLASSIFIED);
619+
classifiedRefiner.push_back(&EXCLUDE_TAXID);
620+
classifiedRefiner.push_back(&SELECT_TAXID);
621+
classifiedRefiner.push_back(&SELECT_COLUMNS);
622+
classifiedRefiner.push_back(&REPORT);
623+
classifiedRefiner.push_back(&RANK);
624+
classifiedRefiner.push_back(&HIGHER_RANK_FILE);
625+
classifiedRefiner.push_back(&PARAM_THREADS);
626+
627+
553628
}
554629

555630
void LocalParameters::printParameters(const std::string &module, int argc, const char* pargv[],

src/commons/LocalParameters.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ class LocalParameters : public Parameters {
4242
std::vector<MMseqsParameter*> accession2taxid;
4343
std::vector<MMseqsParameter*> editNames;
4444
std::vector<MMseqsParameter*> createnewtaxalist;
45+
std::vector<MMseqsParameter*> classifiedRefiner;
46+
4547

4648
// Superkingdom taxonomy id
4749
PARAMETER(VIRUS_TAX_ID)
@@ -114,6 +116,18 @@ class LocalParameters : public Parameters {
114116
PARAMETER(KMER_BEGIN)
115117
PARAMETER(KMER_END)
116118

119+
// classifiedRefiner
120+
PARAMETER(REMOVE_UNCLASSIFIED)
121+
PARAMETER(EXCLUDE_TAXID)
122+
PARAMETER(SELECT_TAXID)
123+
PARAMETER(SELECT_COLUMNS)
124+
PARAMETER(REPORT)
125+
PARAMETER(RANK)
126+
PARAMETER(HIGHER_RANK_FILE)
127+
128+
129+
130+
117131
// Superkingdom taxonomy id
118132
int virusTaxId;
119133
int bacteriaTaxId;
@@ -187,6 +201,16 @@ class LocalParameters : public Parameters {
187201
size_t kmerBegin;
188202
size_t kmerEnd;
189203

204+
// classified2full
205+
bool removeUnclassified;
206+
std::string excludeTaxid;
207+
std::string selectTaxid;
208+
std::string selectColumns;
209+
bool report;
210+
std::string rank;
211+
int higherRankFile;
212+
213+
190214
void printParameters(const std::string &module, int argc,
191215
const char* pargv[],
192216
const std::vector<MMseqsParameter*> &par);

src/commons/Reporter.cpp

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
#include "Reporter.h"
22
#include "taxonomyreport.cpp"
33

4-
Reporter::Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy) : par(par), taxonomy(taxonomy) {
5-
if (par.targetTaxId != 0) {return;}
6-
if (par.contamList == "") { // classify module
7-
if (par.seqMode == 2) {
8-
outDir = par.filenames[3];
9-
jobId = par.filenames[4];
10-
} else {
11-
outDir = par.filenames[2];
12-
jobId = par.filenames[3];
4+
Reporter::Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy, const std::string &customReportFileName) : par(par), taxonomy(taxonomy) {
5+
if (!customReportFileName.empty()){
6+
reportFileName = customReportFileName;
7+
} else {
8+
if (par.targetTaxId != 0) {return;}
9+
if (par.contamList == "") { // classify module
10+
if (par.seqMode == 2) {
11+
outDir = par.filenames[3];
12+
jobId = par.filenames[4];
13+
} else {
14+
outDir = par.filenames[2];
15+
jobId = par.filenames[3];
16+
}
17+
// Output file names
18+
reportFileName = outDir + + "/" + jobId + "_report.tsv";
19+
readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv";
1320
}
14-
// Output file names
15-
reportFileName = outDir + + "/" + jobId + "_report.tsv";
16-
readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv";
1721
}
1822
}
1923

@@ -98,7 +102,7 @@ void Reporter::kronaReport(FILE *FP, const TaxonomyWrapper &taxDB, const std::un
98102
}
99103
}
100104

101-
void Reporter::writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona) {
105+
void Reporter::writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona, const std::string &kronaFileName) {
102106
std::unordered_map<TaxID, std::vector<TaxID>> parentToChildren = taxonomy->getParentToChildren();
103107
unordered_map<TaxID, TaxonCounts> cladeCounts = taxonomy->getCladeCounts(taxCnt, parentToChildren);
104108
FILE *fp;
@@ -109,7 +113,12 @@ void Reporter::writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int
109113

110114
// Write Krona chart
111115
if (krona) {
112-
FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w");
116+
FILE *kronaFile = nullptr;
117+
if (!kronaFileName.empty()){
118+
kronaFile = fopen(kronaFileName.c_str(), "w");
119+
} else{
120+
kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w");
121+
}
113122
fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile);
114123
fprintf(kronaFile, "<node name=\"all\"><magnitude><val>%zu</val></magnitude>", (size_t) numOfQuery);
115124
kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery);

src/commons/Reporter.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ class Reporter {
2323
ofstream readClassificationFile;
2424

2525
public:
26-
Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy);
26+
Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy, const std::string &customReportFileName = "");
2727
// Write report
28-
void writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona = true);
28+
void writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona = true, const std::string &kronaFileName = "");
2929
void writeReport(FILE *FP, const std::unordered_map<TaxID, TaxonCounts> &cladeCounts,
3030
unsigned long totalReads, TaxID taxID = 0, int depth = 0);
3131
void kronaReport(FILE *FP, const TaxonomyWrapper &taxDB, const std::unordered_map<TaxID, TaxonCounts> &cladeCounts, unsigned long totalReads, TaxID taxID = 0, int depth = 0);

src/util/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ set(util_source_files
2020
util/editNames.h
2121
util/createnewtaxalist.cpp
2222
util/createnewtaxalist.h
23+
util/classifiedRefiner.cpp
2324
PARENT_SCOPE)

0 commit comments

Comments
 (0)