Skip to content

Commit 56836b6

Browse files
authored
Merge pull request #145 from jaebeom-kim/master
- Fix a bug that printed the header line multiple times - Improve command grouping - Add the --validate-db option - Use SSE4.1 and ips4o on Windows.
2 parents e2fbe31 + 9403a58 commit 56836b6

23 files changed

+298
-73
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ metabuli classify --seq-mode 3 read.fna dbdir outdir jobid
163163
164164
* Important parameters:
165165
--validate-input : Validate query file format (0 by default)
166+
--validate-db : Validate DB files (0 by default)
166167
--threads : The number of threads used (all by default)
167168
--max-ram : The maximum RAM usage. (128 GiB by default)
168169
--min-score : The minimum score to be classified
@@ -313,6 +314,7 @@ metabuli build --gtdb 1 <DBDIR> <FASTA_LIST> <GTDB_TAXDUMP/taxid.map> --taxonomy
313314
--accession-level : Set 1 to create a DB for accession level classification (0 by default).
314315
--cds-info : List of absolute paths to CDS files.
315316
--validate-input : Validate FASTA file format (0 by default)
317+
--validate-db : Validate created DB files (0 by default)
316318
317319
```
318320
This will generate **diffIdx**, **info**, **split**, **taxID_list**, and some other files. You can delete `*_diffIdx` and `*_info` files.
@@ -340,6 +342,7 @@ metabuli updateDB --gtdb 1 <NEW DBDIR> <FASTA_LIST> <GTDB_TAXDUMP/taxid.map> <OL
340342
--accession-level: Set 1 to add new sequences for accession level classification (0 by default).
341343
--cds-info: List of absolute paths to CDS files.
342344
--validate-input : Validate FASTA file format (0 by default)
345+
--validate-db : Validate created DB files (0 by default)
343346
```
344347

345348
#### \<Add sequences of new taxa>
@@ -429,6 +432,7 @@ metabuli build <DBDIR> <FASTA_LIST> <accession2taxid> --taxonomy-path <TAXDUMP>
429432
--accession-level: Set 1 to create a DB for accession level classification (0 by default).
430433
--cds-info: List of absolute paths to CDS files.
431434
--validate-input : Validate FASTA file format (0 by default)
435+
--validate-db : Validate created DB files (0 by default)
432436
```
433437
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete `*_diffIdx` and `*_info` files and `DATE-TIME` folder (e.g., `2025-1-24-10-32`) if generated.
434438

@@ -465,6 +469,7 @@ metabuli updateDB <NEW DBDIR> <FASTA_LIST> <accession2taxid> <OLD DBDIR> [option
465469
--make-library : Make species library for faster execution (1 by default).
466470
--new-taxa : List of new taxa to be added.
467471
--validate-input : Validate FASTA file format (0 by default)
472+
--validate-db : Validate created DB files (0 by default)
468473
```
469474
470475
#### \<Add sequences of new taxa> - Please refer to [this section](#add-sequences-of-new-taxa).

azure-pipelines.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ jobs:
214214
# %BUILD_SOURCESDIRECTORY%\build\metabuli\metabuli.bat version
215215
# displayName: Setup Busybox
216216
- script: |
217-
%CYGWIN_ROOT%\bin\bash.exe -cl "${BUILD_SOURCESDIRECTORY}/util/Metabuli-regression/run_regression.sh $(cygpath ${BUILD_SOURCESDIRECTORY}/build/build_sse2/src/metabuli.exe) $(cygpath ${BUILD_SOURCESDIRECTORY}/examples)"
217+
%CYGWIN_ROOT%\bin\bash.exe -cl "${BUILD_SOURCESDIRECTORY}/util/Metabuli-regression/run_regression.sh $(cygpath ${BUILD_SOURCESDIRECTORY}/build/build_sse41/src/metabuli.exe) $(cygpath ${BUILD_SOURCESDIRECTORY}/examples)"
218218
displayName: Run Regression Suite
219219
condition: eq(variables['regression'], 1)
220220

lib/fastq_utils/fastq.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ gzFile fastq_open(const char* filename,const char *mode);
6262
static void fastq_close(gzFile fd);
6363

6464
void fastq_print_version() {
65-
fprintf(stderr,"fastq_utils %s\n",VERSION);
65+
fprintf(stderr,"fastq_utils %s\n",FASTQ_UTIL_VERSION);
6666
}
6767

6868

lib/fastq_utils/fastq.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#
2121
# =========================================================
2222
*/
23-
#define VERSION "0.25.2"
23+
#define FASTQ_UTIL_VERSION "0.25.2"
2424

2525
#define DEFAULT 0
2626
#define CASAVA18 1

lib/mmseqs

src/LocalCommandDeclarations.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,6 @@ extern int accession2taxid(int argc, const char **argv, const Command& command);
2626
extern int editNames(int argc, const char **argv, const Command& command);
2727
extern int createnewtaxalist(int argc, const char **argv, const Command& command);
2828
extern int classifiedRefiner(int argc, const char **argv, const Command& command);
29+
extern int validateDatabase(int argc, const char **argv, const Command& command);
2930

3031
#endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H

src/MetabuliBase.cpp

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,16 @@ std::vector<Command> metabuliCommands = {
2020
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
2121
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
2222
{"build", build, &localPar.build, COMMAND_DATABASE_CREATION,
23-
"Build database based on the list of FASTA files.",
23+
"Build database based on the list of FASTA files",
2424
nullptr,
2525
"Jaebeom Kim <jbeom0731@gmail.com>",
26-
"<DB dir> <FASTA list> <Accesssion2taxid>",
26+
"<database directory> <FASTA list> <accesssion2taxid>",
2727
CITATION_SPACEPHARER,
2828
{{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty},
2929
{"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
3030
{"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
3131
{"updateDB", updateDB, &localPar.updateDB, COMMAND_DATABASE_CREATION,
32-
"Add new sequences to the database",
32+
"Add new sequences to an existing database",
3333
nullptr,
3434
"Jaebeom Kim <jbeom0731@gmail.com>",
3535
" <new database directory> <FASTA list> <accesssion2taxid> <old database directory>",
@@ -59,15 +59,15 @@ std::vector<Command> metabuliCommands = {
5959
"<info>",
6060
CITATION_SPACEPHARER,
6161
{{"info", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
62-
{"database-report", databaseReport, &localPar.databaseReport, COMMAND_DB,
63-
"It generates a report of taxa in a database.",
62+
{"database-report", databaseReport, &localPar.databaseReport, COMMAND_DATABASE_CREATION,
63+
"Generate a taxonomy report of a database",
6464
nullptr,
6565
"Jaebeom Kim <jbeom0731@gmail.com>",
66-
"<i: DBDIR> ",
66+
"<i: database directory> ",
6767
CITATION_SPACEPHARER,
6868
{{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
6969
{"classify", classify, &localPar.classify, COMMAND_MAIN,
70-
"Assigning taxonomy label to query reads",
70+
"Assign taxonomic labels to query reads",
7171
nullptr,
7272
"Jaebeom Kim <jbeom0731@gmail.com>",
7373
"<i:query file(s)> <i:database directory> <o:output directory> <job ID> ",
@@ -76,8 +76,8 @@ std::vector<Command> metabuliCommands = {
7676
{"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
7777
{"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory},
7878
{"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
79-
{"extract", extract, &localPar.extract, COMMAND_MAIN,
80-
"It extracts reads classified into a certain taxon. It should be used after classification.",
79+
{"extract", extract, &localPar.extract, COMMAND_FORMAT_CONVERSION,
80+
"Extract reads classified to a certain taxon (Used after classification)",
8181
nullptr,
8282
"Jaebeom Kim <jbeom0731@gmail.com>",
8383
"<i:query file(s)> <i:read-by-read result> <i:database directory>",
@@ -114,7 +114,7 @@ std::vector<Command> metabuliCommands = {
114114
// {"NCBI style accession2taxid file. It should be consistent to tax dump files.", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
115115
// {"DB directory", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
116116
{"make-report", binning2report, &localPar.binning2report, COMMAND_FORMAT_CONVERSION,
117-
"It generates Kraken style report file from binning results",
117+
"Generate a Kraken-style taxonomy report using read-by-read classifications",
118118
nullptr,
119119
"Jaebeom Kim <jbeom0731@gmail.com>",
120120
"<i:Binning Result> <o:OUT DIR> <o:JOB ID> <i: TAXONOMY DIR> ",
@@ -183,13 +183,20 @@ std::vector<Command> metabuliCommands = {
183183
{"accession2taxid", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
184184
{"output prefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
185185
{"classifiedRefiner", classifiedRefiner, &localPar.classifiedRefiner, COMMAND_FORMAT_CONVERSION,
186-
"Generates refined classification file from classified file",
186+
"Refine read-by-read classification file",
187187
nullptr,
188188
"Siyoung Choi <remy0502@snu.ac.kr>",
189189
"<i: classified file> <i: taxonomy dump>",
190190
CITATION_SPACEPHARER,
191191
{{"classified file", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
192-
{"taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}
192+
{"taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
193+
{"validatedb", validateDatabase, &localPar.validateDatabase, COMMAND_DATABASE_CREATION,
194+
"Validate a database",
195+
nullptr,
196+
"Jaebeom Kim <jbeom0731@gmail.com>",
197+
"<i: database directory>",
198+
CITATION_SPACEPHARER,
199+
{{"database directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}
193200
};
194201

195202
std::vector<KmerThreshold> externalThreshold = {};

src/commons/Classifier.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ void Classifier::startClassify(const LocalParameters &par) {
5454

5555
// Get splits for remaining sequences
5656
if (tries == 1) {
57-
cout << "Indexing query file ...";
57+
cout << "Indexing query file ..." << std::flush;
5858
}
5959
queryIndexer->setBytesPerKmer(matchPerKmer);
6060
queryIndexer->indexQueryFile(processedReadCnt);
@@ -124,15 +124,12 @@ void Classifier::startClassify(const LocalParameters &par) {
124124

125125
numOfTatalQueryKmerCnt += queryKmerBuffer.startIndexOfReserve;
126126
} else { // search was incomplete
127-
// Increase matchPerKmer and try again
127+
// increase matchPerKmer and try again
128128
matchPerKmer += 4;
129-
// delete kseq1;
130-
// delete kseq2;
131129
cout << "--match-per-kmer was increased to " << matchPerKmer << " and searching again..." << endl;
132-
// cout << "The search was incomplete. Increasing --match-per-kmer to " << matchPerKmer << " and trying again..." << endl;
133130
break;
134131
}
135-
}
132+
}
136133

137134
delete kseq1;
138135
if (par.seqMode == 2) {

src/commons/KmerMatcher.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ KmerMatcher::~KmerMatcher() {
2626
}
2727

2828
void KmerMatcher::loadTaxIdList(const LocalParameters & par) {
29-
cout << "Loading the list for taxonomy IDs ... ";
29+
cout << "Loading the list for taxonomy IDs ... " << std::flush;
3030
if (par.contamList != "") {
3131
vector<string> contams = Util::split(par.contamList, ",");
3232
for (auto &contam : contams) {

src/commons/LocalParameters.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,13 @@ LocalParameters::LocalParameters() :
3939
typeid(int),
4040
(void *) &skipRedundancy,
4141
"[0-1]"),
42+
VALIDATE_DB(VALIDATE_DB_ID,
43+
"--validate-db",
44+
"Validate the database",
45+
"Validate the database. It checks if all required files are present and if the k-mer count is consistent.",
46+
typeid(int),
47+
(void *) &validateDb,
48+
"[0-1]"),
4249
SEQ_MODE(SEQ_MODE_ID,
4350
"--seq-mode",
4451
"Sequencing type",
@@ -514,6 +521,7 @@ LocalParameters::LocalParameters() :
514521
build.push_back(&MAKE_LIBRARY);
515522
build.push_back(&GTDB);
516523
build.push_back(&VALIDATE_INPUT);
524+
build.push_back(&VALIDATE_DB);
517525

518526
// updateDB
519527
updateDB.push_back(&PARAM_THREADS);
@@ -529,6 +537,7 @@ LocalParameters::LocalParameters() :
529537
updateDB.push_back(&MAKE_LIBRARY);
530538
updateDB.push_back(&GTDB);
531539
updateDB.push_back(&VALIDATE_INPUT);
540+
updateDB.push_back(&VALIDATE_DB);
532541

533542
//classify
534543
classify.push_back(&PARAM_THREADS);
@@ -549,6 +558,7 @@ LocalParameters::LocalParameters() :
549558
classify.push_back(&SKIP_REDUNDANCY);
550559
classify.push_back(&PRINT_LINEAGE);
551560
classify.push_back(&VALIDATE_INPUT);
561+
classify.push_back(&VALIDATE_DB);
552562

553563
// extract
554564
extract.push_back(&TAXONOMY_PATH);

0 commit comments

Comments
 (0)