Skip to content

Commit b9ab2a3

Browse files
authored
Merge read-group branch into master (#165)
* first compiling version after merging read-grouping module * now 1-based temporarily * implement SyncmerScanner_dna2aa * optimize read grouping * cadd parameters for ceateingcommon k-mer list * extract reference k-mers from six-frame translations * extract reference k-mers from six-frame translations * code clean * KmerDbReader only reads k-mer values not IDs * add parameters to read grouping module * use only k-mer values during common k-mer filtering * add yxml into lib * WIP: making read grouping flexible to original classification format * fix read index error in reporting updated results * fix tiny-size db creation error * regression test updated * hide developing features * fix errors in benchmark set generation * fix issue 108 * update database moudle
1 parent 0b0cfff commit b9ab2a3

26 files changed

+2402
-309
lines changed

data/metabulidatabases.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ mkdir -p "${OUTDB}"
9292
case "${SELECTION}" in
9393
"GTDB")
9494
if notExists "${TMP_PATH}/gtdb.tar.gz"; then
95-
downloadFile "https://metabuli.steineggerlab.workers.dev/gtdb.tar.gz" "${TMP_PATH}/gtdb.tar.gz"
95+
downloadFile "https://metabuli.steineggerlab.workers.dev/gtdb+virus+human.tar.gz" "${TMP_PATH}/gtdb.tar.gz"
9696
fi
9797
tar zxvf "${TMP_PATH}/gtdb.tar.gz" -C "${OUTDB}"
9898
# push_back "${TMP_PATH}/gtdb"

src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ include_directories(workflow)
33
include_directories(util)
44
include_directories(benchmark)
55
include_directories(uniref)
6+
include_directories(read-group)
67
include_directories(../lib/prodigal)
78
include_directories(../lib/fasta_validator)
89
include_directories(../lib/fastq_utils)
@@ -14,13 +15,15 @@ add_subdirectory(version)
1415
add_subdirectory(workflow)
1516
add_subdirectory(benchmark)
1617
add_subdirectory(uniref)
18+
add_subdirectory(read-group)
1719

1820
add_executable(metabuli
1921
${commons_source_files}
2022
${util_source_files}
2123
${workflow_source_files}
2224
${benchmark_source_files}
2325
${uniref_source_files}
26+
${read_group_source_files}
2427
metabuli.cpp
2528
LocalCommandDeclarations.h
2629
util/filter_by_genus.cpp

src/MetabuliBase.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ std::vector<Command> metabuliCommands = {
2828
{{"Directory where the DB will be generated", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty},
2929
{"A list of FASTA files", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
3030
{"Mapping file (accession to tax ID)", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
31-
{"create-uniref-db", create_unirefdb, &localPar.buildUnirefDb, COMMAND_DATABASE_CREATION,
31+
{"create-uniref-db", create_unirefdb, &localPar.buildUnirefDb, COMMAND_EXPERT,
3232
"Build a UniRef k-mer database",
3333
nullptr,
3434
"Jaebeom Kim <jbeom0731@gmail.com>",
@@ -37,15 +37,15 @@ std::vector<Command> metabuliCommands = {
3737
{{"Database directory", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty},
3838
{"UniRef100 FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
3939
{"UniRef tree DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
40-
{"create-uniref-tree", create_uniref_tree, &localPar.buildUnirefTree, COMMAND_DATABASE_CREATION,
40+
{"create-uniref-tree", create_uniref_tree, &localPar.buildUnirefTree, COMMAND_EXPERT,
4141
"Build a tree of UniRef100/90/50 clusters",
4242
nullptr,
4343
"Jaebeom Kim <jbeom0731@gmail.com>",
4444
"<database directory> <UniRef100 XML file>",
4545
CITATION_SPACEPHARER,
4646
{{"Database directory", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty},
4747
{"UniRef100 XML file", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
48-
{"create-common-kmer-list", create_common_kmer_list, &localPar.build, COMMAND_EXPERT,
48+
{"create-common-kmer-list", create_common_kmer_list, &localPar.createCommonKmerList, COMMAND_EXPERT,
4949
"Create a list of k-mers that appear in multiple species",
5050
nullptr,
5151
"Jaebeom Kim <jbeom0731@gmail.com>",
@@ -118,7 +118,7 @@ std::vector<Command> metabuliCommands = {
118118
{"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
119119
{"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory},
120120
{"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
121-
{"assign_uniref", assign_uniref, &localPar.assignUniref, COMMAND_MAIN,
121+
{"assign_uniref", assign_uniref, &localPar.assignUniref, COMMAND_EXPERT,
122122
"Assign UniRef cluster labels to query protein sequences.",
123123
nullptr,
124124
"Jaebeom Kim <jbeom0731@gmail.com>",
@@ -274,6 +274,17 @@ std::vector<Command> metabuliCommands = {
274274
{{"Assembly accessions", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
275275
{"Taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
276276
{"Assembly accession to taxid mapping", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
277+
// {"grouping", groupGeneration, &localPar.groupGeneration, COMMAND_MAIN,
278+
// "Assigning group representative taxonomy label to unclassified query reads",
279+
// nullptr,
280+
// "Luna Sung-eun Jang <jse9512@gmail.com>",
281+
// "<i:query file(s)> <i:common k-mer database> <i: taxonomy directory> <i: read-by-read result> <o: output directory>", // TODO: follow this
282+
// CITATION_SPACEPHARER,
283+
// {{"FASTA/Q", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
284+
// {"common k-mer database", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
285+
// {"taxonomy directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
286+
// {"read-by-read result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
287+
// {"output directory", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
277288
{"count-common-kmers", count_common_kmers, &localPar.expand_diffidx, COMMAND_EXPERT,
278289
"Count k-mers common at each taxonomy level",
279290
nullptr,
@@ -296,7 +307,7 @@ std::vector<DatabaseDownload> externalDownloads = {
296307
},
297308
{
298309
"GTDB",
299-
"GTDB 214.1 (Complete/Chromosome level only, CheckM completeness > 90 and contamination < 5) and a human genome (T2T-CHM13v2.0)",
310+
"GTDB 220 (Complete/Chromosome level only, CheckM completeness > 90 and contamination < 5), a human genome (T2T-CHM13v2.0), RefSeq viruses",
300311
"Donovan et al. GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy. Nucleic Acids Res. (2022)",
301312
"https://gtdb.ecogenomic.org/",
302313
true, LocalParameters::DBTYPE_INDEX_DB, metabulidatabases_sh, metabulidatabases_sh_len,

src/benchmark/makeBenchmarkSet.cpp

Lines changed: 87 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "IndexCreator.h"
2+
#include "editNames.h"
23
#include <iostream>
34
#include <istream>
45
#include <string>
@@ -43,6 +44,8 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
4344
string databaseAssemblyList = assemblyList + ".databaseAssembly";
4445
string totalExludedAssemblyList = assemblyList + ".totalExcludedAssembly";
4546

47+
// editNames(taxonomyPath + "/names.dmp", par.assacc2taxid);
48+
4649
// Load taxonomy
4750
TaxonomyWrapper taxonomy(taxonomyPath + "/names.dmp",
4851
taxonomyPath + "/nodes.dmp",
@@ -52,18 +55,21 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
5255

5356
cout << "Making name2taxid map...";
5457
std::unordered_map<std::string, TaxID> name2InternalTaxId;
58+
std::unordered_map<std::string, TaxID> tempMap;
5559
taxonomy.getName2InternalTaxid(name2InternalTaxId);
5660
for (auto &it : name2InternalTaxId) {
5761
if (it.first.find(".") == std::string::npos) {
5862
continue;
5963
}
6064
string accessionNoVersion = it.first.substr(0, it.first.find("."));
61-
name2InternalTaxId[accessionNoVersion] = it.second;
65+
tempMap[accessionNoVersion] = it.second;
66+
}
67+
for (auto &it : tempMap) {
68+
name2InternalTaxId[it.first] = it.second;
6269
}
6370
cout << "done." << endl;
6471

65-
cout << "Making observedAcc2taxid map...";
66-
std::unordered_map<std::string, TaxID> observedAcc2taxid;
72+
6773
cout << "Loading assembly list...";
6874
vector<string> totalAssemblyAccessions;
6975
vector<Assembly> assemblies;
@@ -72,8 +78,11 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
7278
cerr << "Error: could not open assembly list file " << assemblyList << endl;
7379
return 1;
7480
}
81+
7582
string assemblyAccession;
7683
TaxID taxid;
84+
cout << "Making observedAcc2taxid map...";
85+
std::unordered_map<std::string, TaxID> observedAcc2taxid;
7786
while (getline(assemblyListFile, assemblyAccession)) {
7887
string assAccNoVersion = assemblyAccession.substr(0, assemblyAccession.find("."));
7988

@@ -88,7 +97,7 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
8897
} else if (name2InternalTaxId.find(assAccNoVersion) != name2InternalTaxId.end()) {
8998
taxid = name2InternalTaxId[assAccNoVersion];
9099
} else {
91-
cerr << "Error: accession " << assemblyAccession << " not found in the taxonomy" << endl;
100+
cerr << "Error: accession " << assemblyAccession << " " << assAccNoVersion << " not found in the taxonomy" << endl;
92101
return 1;
93102
}
94103
observedAcc2taxid[assemblyAccession] = taxid;
@@ -257,7 +266,7 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
257266
}
258267

259268
// Select n genera with multiple species
260-
n = int(genusWithMultipleSpecies.size() / 4);
269+
n = int(genusWithMultipleSpecies.size() / 3);
261270
cout << "Excluding " << n << " species" << endl;
262271
vector<TaxID> selectedGenera;
263272
for (int i = 0; i < n; i++) {
@@ -314,7 +323,7 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
314323
}
315324

316325
// 2. Select n species with multiple assemblies
317-
n = int(speciesWithMultipleAssemblies.size()/2);
326+
n = int(speciesWithMultipleAssemblies.size()/3);
318327
cout << "Excluding " << n << " assemblies" << endl;
319328
vector<TaxID> selectedSpecies;
320329
for (int i = 0; i < n; i++) {
@@ -409,17 +418,23 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
409418
const string & rank = taxonomy.getString(lcaTaxon->rankIdx);
410419
if (rank == "order") {
411420
orderCount++;
412-
} else if (taxonomy.findRankIndex(rank) == -1) {
413-
cout << "Error: LCA rank index is -1." << endl;
414-
return 1;
415-
} else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("order")) {
416-
cout << "Error: " << excludedGenusAssemblies[i] << " is not a valid family exclusion. LCA is below order rank." << endl;
417-
cout << excludedGenusAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
421+
continue;
422+
}
423+
424+
if (strcmp(taxonomy.getString(lcaTaxon->nameIdx), "root") == 0) {
425+
continue;
426+
}
427+
428+
if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("order")) {
429+
cout << "Error: " << excludedFamilyAssemblies[i] << " is not a valid family exclusion. LCA is below order rank." << endl;
430+
cout << "Name: " << taxonomy.getString(lcaTaxon->nameIdx) << " TaxID: " << lcaTaxon->taxId << endl;
431+
cout << "Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
432+
cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
418433
return 1;
419434
}
420435
}
421436
if (orderCount == 0) {
422-
cout << "Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. No order rank LCA." << endl;
437+
cout << "Error: " << excludedFamilyAssemblies[i] << " is not a valid exclusion. No order rank LCA." << endl;
423438
return 1;
424439
}
425440
}
@@ -438,14 +453,28 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
438453
const string & rank = taxonomy.getString(lcaTaxon->rankIdx);
439454
if (rank == "family") {
440455
familyCount++;
441-
} else if (taxonomy.findRankIndex(rank) == -1) {
442-
cout << "Error: LCA rank index is -1." << endl;
443-
return 1;
444-
} else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("family")) {
445-
cout << "Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. LCA is below Family rank." << endl;
446-
cout << excludedGenusAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
447-
return 1;
456+
continue;
457+
}
458+
459+
if (strcmp(taxonomy.getString(lcaTaxon->nameIdx), "root") == 0) {
460+
continue;
448461
}
462+
463+
if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("family")) {
464+
cout << "Error: " << excludedFamilyAssemblies[i] << " is not a valid genus exclusion. LCA is below family rank." << endl;
465+
cout << "Name: " << taxonomy.getString(lcaTaxon->nameIdx) << " TaxID: " << lcaTaxon->taxId << endl;
466+
cout << "Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
467+
cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
468+
return 1;
469+
}
470+
// else if (taxonomy.findRankIndex(rank) == -1) {
471+
// cout << "Error: LCA rank index is -1." << endl;
472+
// return 1;
473+
// } else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("family")) {
474+
// cout << "Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. LCA is below Family rank." << endl;
475+
// cout << excludedGenusAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
476+
// return 1;
477+
// }
449478
}
450479
if (familyCount == 0) {
451480
cout << "Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. No Family rank LCA." << endl;
@@ -465,16 +494,34 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
465494
TaxID databaseTaxid = observedAcc2taxid[databaseAssemblies[j]];
466495
const TaxonNode * lcaTaxon = taxonomy.taxonNode(taxonomy.LCA(excludedTaxid, databaseTaxid));
467496
const string & rank = taxonomy.getString(lcaTaxon->rankIdx);
497+
468498
if (rank == "genus") {
469499
genusCount++;
470-
} else if (taxonomy.findRankIndex(rank) == -1) {
471-
cout << "Error: LCA rank index is -1." << endl;
472-
return 1;
473-
} else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("genus")) {
474-
cout << "Error: " << excludedSpeciesAssemblies[i] << " is not a valid exclusion. LCA is below Genus rank." << endl;
475-
cout << excludedSpeciesAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
476-
return 1;
500+
continue;
501+
}
502+
503+
if (strcmp(taxonomy.getString(lcaTaxon->nameIdx), "root") == 0) {
504+
continue;
477505
}
506+
507+
if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("genus")) {
508+
cout << "Error: " << excludedFamilyAssemblies[i] << " is not a valid species exclusion. LCA is below genus rank." << endl;
509+
cout << "Name: " << taxonomy.getString(lcaTaxon->nameIdx) << " TaxID: " << lcaTaxon->taxId << endl;
510+
cout << "Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
511+
cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
512+
return 1;
513+
}
514+
515+
// if (rank == "genus") {
516+
// genusCount++;
517+
// } else if (taxonomy.findRankIndex(rank) == -1) {
518+
// cout << "Error: LCA rank index is -1." << endl;
519+
// return 1;
520+
// } else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("genus")) {
521+
// cout << "Error: " << excludedSpeciesAssemblies[i] << " is not a valid exclusion. LCA is below Genus rank." << endl;
522+
// cout << excludedSpeciesAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
523+
// return 1;
524+
// }
478525
}
479526
if (genusCount == 0) {
480527
cout << "Error: " << excludedSpeciesAssemblies[i] << " is not a valid exclusion. No Genus rank LCA." << endl;
@@ -496,11 +543,20 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
496543
const string & rank = taxonomy.getString(lcaTaxon->rankIdx);
497544
if (rank == "species") {
498545
speciesCount++;
499-
} else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("species")) {
500-
cout << "Error: " << excludedSubspeciesAssemblies[i] << " is not a valid exclusion. LCA is below Species rank." << endl;
501-
cout << excludedSubspeciesAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
502-
return 1;
546+
continue;
547+
}
548+
549+
if (strcmp(taxonomy.getString(lcaTaxon->nameIdx), "root") == 0) {
550+
continue;
503551
}
552+
553+
if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("species")) {
554+
cout << "Error: " << excludedFamilyAssemblies[i] << " is not a valid exclusion. LCA is below species rank." << endl;
555+
cout << "Name: " << taxonomy.getString(lcaTaxon->nameIdx) << " TaxID: " << lcaTaxon->taxId << endl;
556+
cout << "Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
557+
cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
558+
return 1;
559+
}
504560
}
505561
if (speciesCount == 0) {
506562
cout << "Error: " << excludedSubspeciesAssemblies[i] << " is not a valid exclusion. No Species rank LCA." << endl;

0 commit comments

Comments
 (0)