Skip to content

Commit df1c74d

Browse files
authored
Merge pull request #206 from h-2/TODOs
TODOs
2 parents 282b8fb + 9b36380 commit df1c74d

12 files changed

+269
-276
lines changed

src/mkindex.cpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,6 @@ void argConv3a(LambdaIndexerOptions const & options)
156156
return realMain<c_indexType, c_origAlph, c_transAlph, AlphabetEnum::MURPHY10>(options);
157157
case AlphabetEnum::LI10:
158158
return realMain<c_indexType, c_origAlph, c_transAlph, AlphabetEnum::LI10>(options);
159-
//TODO other reduced alphabets
160159
default:
161160
throw 45;
162161
}

src/mkindex_algo.hpp

Lines changed: 48 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,6 @@
3535
#include <bio/ranges/views/translate_join.hpp>
3636

3737
#include "mkindex_misc.hpp"
38-
// #include "mkindex_saca.hpp"
3938
#include "shared_definitions.hpp"
4039
#include "shared_misc.hpp"
4140
#include "shared_options.hpp"
@@ -69,7 +68,7 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
6968
// see http://www.uniprot.org/help/accession_numbers
7069
// https://www.ncbi.nlm.nih.gov/Sequin/acc.html
7170
// https://www.ncbi.nlm.nih.gov/refseq/about/
72-
// TODO: make sure these don't trigger twice on one ID
71+
// REMARK: these might trigger twice on one ID
7372
std::regex const accRegEx{
7473
"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}|" // UNIPROT
7574
"[A-Z][0-9]{5}|[A-Z]{2}[0-9]{6}|" // NCBI nucl
@@ -93,7 +92,6 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
9392
assert(accToIdRank.count(it->str()) == 0);
9493
// "An accession number appeared twice in the file, but they should be unique.");
9594

96-
// TODO store acc outside as well
9795
accToIdRank[it->str()] = rank;
9896
}
9997

@@ -145,14 +143,16 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
145143
throw std::runtime_error("ERROR: No sequences in file. Aborting.\n");
146144
}
147145

148-
size_t maxLen = 0ul;
146+
size_t maxLen = 0ul;
147+
size_t lengthSum = 0ul;
149148
for (auto const & s : originalSeqs)
150149
{
151-
if (std::ranges::size(s) > maxLen)
150+
lengthSum += s.size();
151+
if (s.size() > maxLen)
152152
{
153-
maxLen = std::ranges::size(s);
153+
maxLen = s.size();
154154
}
155-
else if (std::ranges::size(s) == 0ul)
155+
else if (s.size() == 0ul)
156156
{
157157
throw std::runtime_error(
158158
"ERROR: Unexpectedly encountered a sequence of length 0 in the file."
@@ -165,6 +165,8 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
165165
std::ranges::size(originalSeqs),
166166
"\nLongest sequence read: ",
167167
maxLen,
168+
"\nSum of all sequence lengths: ",
169+
lengthSum,
168170
"\n");
169171

170172
if (options.hasSTaxIds)
@@ -310,8 +312,6 @@ auto mapTaxIDs(TaccToIdRank const & accToIdRank, uint64_t const numSubjects, Lam
310312

311313
myPrint(options, 2, "Runtime: ", sysTime() - start, "s \n");
312314

313-
// TODO do something with the subjects that have no (valid) taxid?
314-
315315
uint64_t nomap = 0;
316316
uint64_t multi = 0;
317317

@@ -368,34 +368,31 @@ auto parseAndStoreTaxTree(std::vector<bool> & taxIdIsPresent, LambdaIndexerOptio
368368

369369
double start = sysTime();
370370

371-
std::string buf;
372-
std::regex const numRegEx{"\\b\\d+\\b"};
373-
374-
//TODO it would be better to do TSV reading here instead of the regex-voodoo, but I need to understand it first o_O
375-
bio::io::txt::reader reader{options.taxDumpDir + "/nodes.dmp"};
376-
for (std::string_view line : reader)
371+
bio::io::txt::reader reader{options.taxDumpDir + "/nodes.dmp", '\t'};
372+
for (auto & record : reader)
377373
{
378-
uint32_t n = 0;
379-
uint32_t parent = 0;
380-
unsigned i = 0;
381-
for (auto it = std::cregex_iterator(line.begin(), line.end(), numRegEx), itEnd = std::cregex_iterator();
382-
(it != itEnd) && (i < 2);
383-
++it, ++i)
374+
/* first column (own id) */
375+
uint32_t n = 0;
376+
std::string_view col1 = record.fields[0];
377+
auto res1 = std::from_chars(col1.data(), col1.data() + col1.size(), n);
378+
if (res1.ec != std::errc{})
384379
{
385-
std::string strbuf = it->str();
386-
std::from_chars_result res;
387-
388-
if (i == 0)
389-
res = std::from_chars(strbuf.data(), strbuf.data() + strbuf.size(), n);
390-
else
391-
res = std::from_chars(strbuf.data(), strbuf.data() + strbuf.size(), parent);
380+
throw std::runtime_error{
381+
std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} +
382+
static_cast<std::string>(col1) + "\n"};
383+
}
392384

393-
if (res.ec != std::errc{})
394-
{
395-
throw std::runtime_error{
396-
std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} + strbuf + "\n"};
397-
}
385+
/* second column (parent id) */
386+
uint32_t parent = 0;
387+
std::string_view col2 = record.fields[2]; // fields[1] is '|'
388+
auto res2 = std::from_chars(col2.data(), col2.data() + col2.size(), parent);
389+
if (res2.ec != std::errc{})
390+
{
391+
throw std::runtime_error{
392+
std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} +
393+
static_cast<std::string>(col2) + "\n"};
398394
}
395+
399396
if (std::ranges::size(taxonParentIDs) <= n)
400397
taxonParentIDs.resize(n + 1, 0);
401398
taxonParentIDs[n] = parent;
@@ -545,54 +542,37 @@ auto parseAndStoreTaxTree(std::vector<bool> & taxIdIsPresent, LambdaIndexerOptio
545542

546543
start = sysTime();
547544

548-
std::regex const wordRegEx{R"([\w.,\"<> ]+)"};
549-
std::string name;
550-
551-
//TODO it would be better to do TSV reading here instead of the regex-voodoo, but I need to understand it first o_O
552-
bio::io::txt::reader reader2{options.taxDumpDir + "/names.dmp"};
553-
for (std::string_view line : reader2)
545+
bio::io::txt::reader reader2{options.taxDumpDir + "/names.dmp", '\t'};
546+
for (auto & record : reader2)
554547
{
555-
uint32_t taxId = 0;
548+
assert(record.fields.size() == 8);
556549

557-
auto itWord = std::cregex_iterator(line.begin(), line.end(), wordRegEx);
558-
if (itWord == std::cregex_iterator())
559-
{
560-
throw std::runtime_error("Error: Expected taxonomical ID in first column, but couldn't find it.\n");
561-
}
562-
else
563-
{
564-
std::string strbuf = itWord->str();
565-
std::from_chars_result res;
566-
567-
res = std::from_chars(strbuf.data(), strbuf.data() + strbuf.size(), taxId);
550+
if (record.fields.size() < 6)
551+
continue;
568552

569-
if (res.ec != std::errc{})
553+
if (record.fields[6] == "scientific name")
554+
{
555+
/* first column */
556+
uint32_t taxId = 0;
557+
std::string_view col1 = record.fields[0];
558+
auto res1 = std::from_chars(col1.data(), col1.data() + col1.size(), taxId);
559+
if (res1.ec != std::errc{})
570560
{
571561
throw std::runtime_error{
572-
std::string{"Error: Expected taxonomical ID in first column, but got something I couldn't read: "} +
573-
strbuf + "\n"};
562+
std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} +
563+
static_cast<std::string>(col1) + "\n"};
574564
}
575565

576566
if (taxId >= std::ranges::size(taxonNames))
577567
{
578568
throw std::runtime_error(std::string("Error: taxonomical ID is ") + std::to_string(taxId) +
579569
", but no such taxon in tree.\n");
580570
}
581-
}
582-
583-
// we don't need this name
584-
if (!taxIdIsPresentOrParent[taxId])
585-
continue;
586571

587-
if (++itWord == std::cregex_iterator())
588-
throw std::runtime_error("Error: Expected name in second column, but couldn't find it.\n");
589-
else
590-
name = itWord->str();
591-
592-
while (++itWord != std::cregex_iterator())
593-
{
594-
if (itWord->str() == "scientific name")
595-
taxonNames[taxId] = name;
572+
/* second column (name) */
573+
if (taxIdIsPresentOrParent[taxId]) // check if we need the name
574+
taxonNames[taxId].assign(record.fields[2].begin(),
575+
record.fields[2].end()); // fields[1] is '|' separator
596576
}
597577
}
598578

src/mkindex_options.hpp

Lines changed: 22 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -92,24 +92,22 @@ void parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** ar
9292

9393
parser.add_section("Input Options");
9494

95-
// TODO Change file extensions, make more generic
95+
std::vector<std::string> extensions{"fa", "fq", "fasta", "fastq"};
96+
#ifdef SEQAN_HAS_ZLIB
97+
for (auto const & ext : extensions)
98+
extensions.push_back(ext + ".gz");
99+
#endif
96100
parser.add_option(options.dbFile,
97101
sharg::config{.short_id = 'd',
98102
.long_id = "database",
99103
.description = "Database sequences.",
100104
.required = true,
101-
.validator = sharg::input_file_validator{{"fa", "fq", "fasta", "fastq", "gz"}}});
105+
.validator = sharg::input_file_validator{extensions}});
102106

103-
std::vector<std::string> taxExtensions{"accession2taxid", "dat"};
107+
extensions = {"accession2taxid", "dat"};
104108
#ifdef SEQAN_HAS_ZLIB
105-
taxExtensions.push_back("accession2taxid.gz");
106-
taxExtensions.push_back("accession2taxid.bgzf");
107-
taxExtensions.push_back("dat.gz");
108-
taxExtensions.push_back("dat.bgzf");
109-
#endif
110-
#ifdef SEQAN_HAS_BZIP2
111-
taxExtensions.push_back("accession2taxid.bz2");
112-
taxExtensions.push_back("dat.bz2");
109+
for (auto const & ext : extensions)
110+
extensions.push_back(ext + ".gz");
113111
#endif
114112

115113
parser.add_option(
@@ -120,7 +118,7 @@ void parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** ar
120118
"An NCBI or UniProt accession-to-taxid mapping file. Download from "
121119
"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/ or "
122120
"ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/ .",
123-
.validator = sharg::input_file_validator(taxExtensions)});
121+
.validator = sharg::input_file_validator{extensions}});
124122

125123
parser.add_option(options.taxDumpDir,
126124
sharg::config{.short_id = 'x',
@@ -132,13 +130,18 @@ void parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** ar
132130
parser.add_section("Output Options");
133131

134132
options.indexFilePath = "»INPUT«.lba";
135-
parser.add_option(options.indexFilePath,
136-
sharg::config{
137-
.short_id = 'i',
138-
.long_id = "index",
139-
.description = "The output path for the index file.",
140-
.validator = sharg::output_file_validator{sharg::output_file_open_options::create_new,
141-
{"lba", "lta", "lba.gz", "lta.gz"}}
133+
extensions = {"lba", "lta"};
134+
#ifdef SEQAN_HAS_ZLIB
135+
for (auto const & ext : extensions)
136+
extensions.push_back(ext + ".gz");
137+
#endif
138+
parser.add_option(
139+
options.indexFilePath,
140+
sharg::config{
141+
.short_id = 'i',
142+
.long_id = "index",
143+
.description = "The output path for the index file.",
144+
.validator = sharg::output_file_validator{sharg::output_file_open_options::create_new, {extensions}}
142145
});
143146

144147
options.threads = std::max<size_t>(2ul, std::min<size_t>(std::thread::hardware_concurrency(), 4ul));

src/search.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -412,7 +412,7 @@ void realMain(LambdaOptions const & options)
412412
#ifdef LAMBDA_MICRO_STATS
413413
double buf = sysTime();
414414
#endif
415-
search(localHolder); //TODO seed refining if iterateMatches gives 0 results
415+
search(localHolder);
416416
#ifdef LAMBDA_MICRO_STATS
417417
localHolder.stats.timeSearch += sysTime() - buf;
418418
#endif

0 commit comments

Comments
 (0)