Merge pull request #141 from borijoa/refine-merge

jaebeom-kim · web-flow · commit 25c096153090 · 2025-05-14T17:18:55.000+09:00
Add classifiedRefiner
diff --git a/README.md b/README.md
@@ -64,6 +64,7 @@ Please cite: [Kim J, Steinegger M. Metabuli: sensitive and specific metagenomic
   - [Compile from source code](#compile-from-source-code)
 - [Pre-built databases](#pre-built-databases)
 - [Classification](#classification)
+- [Refine Classification file](#refine-classification-file)
 - [Extract](#extract)
 - [GTDB-based custom database](#gtdb-based-custom-database)
   - [Creat a new database](#creat-a-new-database)
@@ -228,6 +229,32 @@ It is for an interactive taxonomy report (Krona). You can use any modern web bro
 Metabuli can classify reads against a database of any size as long as the database is fits in the hard disk, regardless of the machine's RAM size.
 We tested it with a MacBook Air (2020, M1, 8 GiB), where we classified about 15 M paired-end 150 bp reads (~5 GiB in size) against a database built with ~23K prokaryotic genomes (~69 GiB in size).
 
+---
+## Refine Classification file
+After running the `classify` command, you can refine the classification file with the options below.
+
+```
+metabuli classifiedRefiner <i:read-by-read classification> <i:DBDIR> [options]
+
+- read-by-read classification : The JobID_classifications.tsv file generated by the `classify` step.
+- DBDIR : The same DBDIR used in the `classify` step.
+
+* Options
+   --threads : The number of threads to utilize (all by default)
+   --remove-unclassified : Remove unclassified reads
+   --exclude-taxid : Remove the listed taxids as well as their children (e.g., 1758,9685,1234)
+   --select-taxid : Select the listed taxids as well as their children (e.g., 1758,9685,1234)
+   --select-columns : Select columns by number; column 7 is the full lineage, which is generated if absent (e.g., 2,5,7,3)
+   --report : Write a report of the refined classification file
+   --rank : Adjust classification to the specified rank
+   --rank-file-type : Choose how to handle reads assigned to higher taxonomic ranks when using the --rank option. [0: exclude higher ranks, 1: include higher ranks, 2: write a separate file for higher-rank classifications]
+
+```
+#### Output
+- refined classification file : `JobID_refined.tsv`
+- report : `JobID_refined_report.tsv`, `JobID_refined_krona.html`
+- higher rank classification file : `_refined_higherRanks.tsv`
+
 ---
 ## Extract 
 After running the `classify` command, you can extract reads that are classified under a specific taxon.
diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h
@@ -25,5 +25,6 @@ extern int taxdump(int argc, const char **argv, const Command& command);
 extern int accession2taxid(int argc, const char **argv, const Command& command);
 extern int editNames(int argc, const char **argv, const Command& command);
 extern int createnewtaxalist(int argc, const char **argv, const Command& command);
+extern int classifiedRefiner(int argc, const char **argv, const Command& command);
 
 #endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H
diff --git a/src/MetabuliBase.cpp b/src/MetabuliBase.cpp
@@ -181,7 +181,15 @@ std::vector<Command> metabuliCommands = {
                  {"FASTA list", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
                  {"new taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
                  {"accession2taxid", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
-                 {"output prefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}
+                 {"output prefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
+        {"classifiedRefiner", classifiedRefiner, &localPar.classifiedRefiner, COMMAND_FORMAT_CONVERSION,
+                "Generates refined classification file from classified file",
+                nullptr,
+                "Siyoung Choi <remy0502@snu.ac.kr>",
+                "<i: classified file> <i: taxonomy dump>",
+                CITATION_SPACEPHARER,
+                {{"classified file", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
+                {"taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}
 };
 
 std::vector<KmerThreshold> externalThreshold = {};
diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp
@@ -315,8 +315,8 @@ LocalParameters::LocalParameters() :
                      "^[0-9]+$"),
         PRINT_COLUMNS(PRINT_COLUMNS_ID,
                       "--print-columns",
-                      "CSV of column numbers to be printed",
-                      "CSV of column numbers to be printed",
+                      "CSV of columns to print",
+                      "CSV of columns to print",
                       typeid(std::string),
                       (void *) &printColumns,
                       "^.*$"),
@@ -375,7 +375,59 @@ LocalParameters::LocalParameters() :
                 "Last k-mer to print",
                 typeid(size_t),
                 (void *) &kmerEnd,
-                "^[0-9]+$")
+                "^[0-9]+$"),
+        REMOVE_UNCLASSIFIED(REMOVE_UNCLASSIFIED_ID,
+                "--remove-unclassified",
+                "Remove unclassified reads",
+                "Remove unclassified reads",
+                typeid(bool),
+                (void *) &removeUnclassified,
+                ""),
+        EXCLUDE_TAXID(EXCLUDE_TAXID_ID,
+                "--exclude-taxid",
+                "Exclude taxId as well as its children",
+                "Exclude taxId as well as its children",
+                typeid(std::string),
+                (void *) &excludeTaxid,
+                "^.*$"),
+        SELECT_TAXID(SELECT_TAXID_ID,
+                "--select-taxid",
+                "Select taxId as well as its children",
+                "Select taxId as well as its children",
+                typeid(std::string),
+                (void *) &selectTaxid,
+                "^.*$"),
+        SELECT_COLUMNS(SELECT_COLUMNS_ID,
+                "--select-columns",
+                "Select columns with number, (7:full lineage, generated if absent)",
+                "Select columns with number, (7:full lineage, generated if absent)",
+                typeid(std::string),
+                (void *) &selectColumns,
+                "^.*$"),
+        REPORT(REPORT_ID,
+                "--report",
+                "Make report of refined classification file",
+                "Make report of refined classification file",
+                typeid(bool),
+                (void *) &report,
+                ""),
+        RANK(RANK_ID,
+                "--rank",
+                "Adjust classification to the specified rank",
+                "Adjust classification to the specified rank",
+                typeid(std::string),
+                (void *) &rank,
+                "^.*$"),
+        HIGHER_RANK_FILE(HIGHER_RANK_FILE_ID,
+                "--rank-file-type",
+                "0: without higher rank, 1: with higher rank, 2: separate file for higher rank classification",
+                "0: without higher rank, 1: with higher rank, 2: separate file for higher rank classification",
+                typeid(int),
+                (void *) &higherRankFile,
+                "^[0-2]$")
+            
+        
+        
   {
     // Initialize the parameters
     // Superkingdom taxonomy id
@@ -429,6 +481,17 @@ LocalParameters::LocalParameters() :
     printMode = 0;
     contamList = "";
 
+    // classifiedRefiner
+    removeUnclassified = false;
+    excludeTaxid = "";
+    selectTaxid = "";
+    selectColumns = "";
+    report = false;
+    rank = "";
+    higherRankFile = 0;
+
+
+
 
     // build
     build.push_back(&PARAM_THREADS);
@@ -550,6 +613,18 @@ LocalParameters::LocalParameters() :
     expand_diffidx.push_back(&KMER_END);
 
     query2reference.push_back(&TEST_RANK);
+
+    // classifiedRefiner
+    classifiedRefiner.push_back(&REMOVE_UNCLASSIFIED);
+    classifiedRefiner.push_back(&EXCLUDE_TAXID);
+    classifiedRefiner.push_back(&SELECT_TAXID);
+    classifiedRefiner.push_back(&SELECT_COLUMNS);
+    classifiedRefiner.push_back(&REPORT);
+    classifiedRefiner.push_back(&RANK);
+    classifiedRefiner.push_back(&HIGHER_RANK_FILE);
+    classifiedRefiner.push_back(&PARAM_THREADS);
+    
+
 }
 
 void LocalParameters::printParameters(const std::string &module, int argc, const char* pargv[],
diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h
@@ -42,6 +42,8 @@ class LocalParameters : public Parameters {
     std::vector<MMseqsParameter*> accession2taxid;
     std::vector<MMseqsParameter*> editNames;
     std::vector<MMseqsParameter*> createnewtaxalist;
+    std::vector<MMseqsParameter*> classifiedRefiner;
+    
 
     // Superkingdom taxonomy id
     PARAMETER(VIRUS_TAX_ID)
@@ -114,6 +116,18 @@ class LocalParameters : public Parameters {
     PARAMETER(KMER_BEGIN)
     PARAMETER(KMER_END)
 
+    // classifiedRefiner
+    PARAMETER(REMOVE_UNCLASSIFIED)
+    PARAMETER(EXCLUDE_TAXID)
+    PARAMETER(SELECT_TAXID)
+    PARAMETER(SELECT_COLUMNS)
+    PARAMETER(REPORT)
+    PARAMETER(RANK)
+    PARAMETER(HIGHER_RANK_FILE)
+
+
+
+
     // Superkingdom taxonomy id
     int virusTaxId;
     int bacteriaTaxId;
@@ -187,6 +201,16 @@ class LocalParameters : public Parameters {
     size_t kmerBegin;
     size_t kmerEnd;
 
+    // classifiedRefiner
+    bool removeUnclassified;
+    std::string excludeTaxid;
+    std::string selectTaxid;
+    std::string selectColumns;
+    bool report;
+    std::string rank;
+    int higherRankFile;
+   
+
     void printParameters(const std::string &module, int argc,
                          const char* pargv[],
                          const std::vector<MMseqsParameter*> &par);
diff --git a/src/commons/Reporter.cpp b/src/commons/Reporter.cpp
@@ -1,19 +1,23 @@
 #include "Reporter.h"
 #include "taxonomyreport.cpp"
 
-Reporter::Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy) : par(par), taxonomy(taxonomy) {
-    if (par.targetTaxId != 0) {return;}
-    if (par.contamList == "") { // classify module
-        if (par.seqMode == 2) {
-            outDir = par.filenames[3];
-            jobId = par.filenames[4];
-        } else {
-            outDir = par.filenames[2];
-            jobId = par.filenames[3];
+Reporter::Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy, const std::string &customReportFileName) : par(par), taxonomy(taxonomy) {
+    if (!customReportFileName.empty()){
+        reportFileName = customReportFileName;
+    } else {
+        if (par.targetTaxId != 0) {return;}
+        if (par.contamList == "") { // classify module
+            if (par.seqMode == 2) {
+                outDir = par.filenames[3];
+                jobId = par.filenames[4];
+            } else {
+                outDir = par.filenames[2];
+                jobId = par.filenames[3];
+            }
+            // Output file names
+            reportFileName = outDir + + "/" + jobId + "_report.tsv";
+            readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv";
         }
-        // Output file names
-        reportFileName = outDir + + "/" + jobId + "_report.tsv";
-        readClassificationFileName = outDir + "/" + jobId + "_classifications.tsv";
     }    
 }
 
@@ -98,7 +102,7 @@ void Reporter::kronaReport(FILE *FP, const TaxonomyWrapper &taxDB, const std::un
     }
 }
 
-void Reporter::writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona) {
+void Reporter::writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona, const std::string &kronaFileName) {
     std::unordered_map<TaxID, std::vector<TaxID>> parentToChildren = taxonomy->getParentToChildren();
     unordered_map<TaxID, TaxonCounts> cladeCounts = taxonomy->getCladeCounts(taxCnt, parentToChildren);
     FILE *fp;
@@ -109,7 +113,12 @@ void Reporter::writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int
 
     // Write Krona chart
     if (krona) {
-        FILE *kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w");
+        FILE *kronaFile = nullptr;
+        if (!kronaFileName.empty()){
+            kronaFile = fopen(kronaFileName.c_str(), "w");
+        } else{
+            kronaFile = fopen((outDir + "/" + jobId + "_krona.html").c_str(), "w");
+        }
         fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), kronaFile);
         fprintf(kronaFile, "<node name=\"all\"><magnitude><val>%zu</val></magnitude>", (size_t) numOfQuery);
         kronaReport(kronaFile, *taxonomy, cladeCounts, numOfQuery);
diff --git a/src/commons/Reporter.h b/src/commons/Reporter.h
@@ -23,9 +23,9 @@ class Reporter {
     ofstream readClassificationFile;
 
 public:
-    Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy);
+    Reporter(const LocalParameters &par, TaxonomyWrapper *taxonomy, const std::string &customReportFileName = "");
     // Write report
-    void writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona = true);
+    void writeReportFile(int numOfQuery, unordered_map<TaxID, unsigned int> &taxCnt, bool krona = true, const std::string &kronaFileName = "");
     void writeReport(FILE *FP, const std::unordered_map<TaxID, TaxonCounts> &cladeCounts,
                      unsigned long totalReads, TaxID taxID = 0, int depth = 0);
     void kronaReport(FILE *FP, const TaxonomyWrapper &taxDB, const std::unordered_map<TaxID, TaxonCounts> &cladeCounts, unsigned long totalReads, TaxID taxID = 0, int depth = 0);
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
@@ -20,4 +20,5 @@ set(util_source_files
         util/editNames.h
         util/createnewtaxalist.cpp
         util/createnewtaxalist.h
+        util/classifiedRefiner.cpp
         PARENT_SCOPE)
diff --git a/src/util/classifiedRefiner.cpp b/src/util/classifiedRefiner.cpp