3535#include < bio/ranges/views/translate_join.hpp>
3636
3737#include " mkindex_misc.hpp"
38- // #include "mkindex_saca.hpp"
3938#include " shared_definitions.hpp"
4039#include " shared_misc.hpp"
4140#include " shared_options.hpp"
@@ -69,7 +68,7 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
6968 // see http://www.uniprot.org/help/accession_numbers
7069 // https://www.ncbi.nlm.nih.gov/Sequin/acc.html
7170 // https://www.ncbi.nlm.nih.gov/refseq/about/
72- // TODO: make sure these don't trigger twice on one ID
71+ // REMARK: these might trigger twice on one ID
7372 std::regex const accRegEx{
7473 " [OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}|" // UNIPROT
7574 " [A-Z][0-9]{5}|[A-Z]{2}[0-9]{6}|" // NCBI nucl
@@ -93,7 +92,6 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
9392 assert (accToIdRank.count (it->str ()) == 0 );
9493 // "An accession number appeared twice in the file, but they should be unique.");
9594
96- // TODO store acc outside as well
9795 accToIdRank[it->str ()] = rank;
9896 }
9997
@@ -145,14 +143,16 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
145143 throw std::runtime_error (" ERROR: No sequences in file. Aborting.\n " );
146144 }
147145
148- size_t maxLen = 0ul ;
146+ size_t maxLen = 0ul ;
147+ size_t lengthSum = 0ul ;
149148 for (auto const & s : originalSeqs)
150149 {
151- if (std::ranges::size (s) > maxLen)
150+ lengthSum += s.size ();
151+ if (s.size () > maxLen)
152152 {
153- maxLen = std::ranges:: size (s );
153+ maxLen = s. size ();
154154 }
155- else if (std::ranges:: size (s ) == 0ul )
155+ else if (s. size () == 0ul )
156156 {
157157 throw std::runtime_error (
158158 " ERROR: Unexpectedly encountered a sequence of length 0 in the file."
@@ -165,6 +165,8 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
165165 std::ranges::size (originalSeqs),
166166 " \n Longest sequence read: " ,
167167 maxLen,
168+ " \n Sum of all sequence lengths: " ,
169+ lengthSum,
168170 " \n " );
169171
170172 if (options.hasSTaxIds )
@@ -310,8 +312,6 @@ auto mapTaxIDs(TaccToIdRank const & accToIdRank, uint64_t const numSubjects, Lam
310312
311313 myPrint (options, 2 , " Runtime: " , sysTime () - start, " s \n " );
312314
313- // TODO do something with the subjects that have no (valid) taxid?
314-
315315 uint64_t nomap = 0 ;
316316 uint64_t multi = 0 ;
317317
@@ -368,34 +368,31 @@ auto parseAndStoreTaxTree(std::vector<bool> & taxIdIsPresent, LambdaIndexerOptio
368368
369369 double start = sysTime ();
370370
371- std::string buf;
372- std::regex const numRegEx{" \\ b\\ d+\\ b" };
373-
374- // TODO it would be better to do TSV reading here instead of the regex-voodoo, but I need to understand it first o_O
375- bio::io::txt::reader reader{options.taxDumpDir + " /nodes.dmp" };
376- for (std::string_view line : reader)
371+ bio::io::txt::reader reader{options.taxDumpDir + " /nodes.dmp" , ' \t ' };
372+ for (auto & record : reader)
377373 {
378- uint32_t n = 0 ;
379- uint32_t parent = 0 ;
380- unsigned i = 0 ;
381- for (auto it = std::cregex_iterator (line.begin (), line.end (), numRegEx), itEnd = std::cregex_iterator ();
382- (it != itEnd) && (i < 2 );
383- ++it, ++i)
374+ /* first column (own id) */
375+ uint32_t n = 0 ;
376+ std::string_view col1 = record.fields [0 ];
377+ auto res1 = std::from_chars (col1.data (), col1.data () + col1.size (), n);
378+ if (res1.ec != std::errc{})
384379 {
385- std::string strbuf = it->str ();
386- std::from_chars_result res;
387-
388- if (i == 0 )
389- res = std::from_chars (strbuf.data (), strbuf.data () + strbuf.size (), n);
390- else
391- res = std::from_chars (strbuf.data (), strbuf.data () + strbuf.size (), parent);
380+ throw std::runtime_error{
381+ std::string{" Error: Expected taxonomical ID, but got something I couldn't read: " } +
382+ static_cast <std::string>(col1) + " \n " };
383+ }
392384
393- if (res.ec != std::errc{})
394- {
395- throw std::runtime_error{
396- std::string{" Error: Expected taxonomical ID, but got something I couldn't read: " } + strbuf + " \n " };
397- }
385+ /* second column (parent id) */
386+ uint32_t parent = 0 ;
387+ std::string_view col2 = record.fields [2 ]; // fields[1] is '|'
388+ auto res2 = std::from_chars (col2.data (), col2.data () + col2.size (), parent);
389+ if (res2.ec != std::errc{})
390+ {
391+ throw std::runtime_error{
392+ std::string{" Error: Expected taxonomical ID, but got something I couldn't read: " } +
393+ static_cast <std::string>(col2) + " \n " };
398394 }
395+
399396 if (std::ranges::size (taxonParentIDs) <= n)
400397 taxonParentIDs.resize (n + 1 , 0 );
401398 taxonParentIDs[n] = parent;
@@ -545,54 +542,37 @@ auto parseAndStoreTaxTree(std::vector<bool> & taxIdIsPresent, LambdaIndexerOptio
545542
546543 start = sysTime ();
547544
548- std::regex const wordRegEx{R"( [\w.,\"<> ]+)" };
549- std::string name;
550-
551- // TODO it would be better to do TSV reading here instead of the regex-voodoo, but I need to understand it first o_O
552- bio::io::txt::reader reader2{options.taxDumpDir + " /names.dmp" };
553- for (std::string_view line : reader2)
545+ bio::io::txt::reader reader2{options.taxDumpDir + " /names.dmp" , ' \t ' };
546+ for (auto & record : reader2)
554547 {
555- uint32_t taxId = 0 ;
548+ assert (record. fields . size () == 8 ) ;
556549
557- auto itWord = std::cregex_iterator (line.begin (), line.end (), wordRegEx);
558- if (itWord == std::cregex_iterator ())
559- {
560- throw std::runtime_error (" Error: Expected taxonomical ID in first column, but couldn't find it.\n " );
561- }
562- else
563- {
564- std::string strbuf = itWord->str ();
565- std::from_chars_result res;
566-
567- res = std::from_chars (strbuf.data (), strbuf.data () + strbuf.size (), taxId);
550+ if (record.fields .size () < 6 )
551+ continue ;
568552
569- if (res.ec != std::errc{})
553+ if (record.fields [6 ] == " scientific name" )
554+ {
555+ /* first column */
556+ uint32_t taxId = 0 ;
557+ std::string_view col1 = record.fields [0 ];
558+ auto res1 = std::from_chars (col1.data (), col1.data () + col1.size (), taxId);
559+ if (res1.ec != std::errc{})
570560 {
571561 throw std::runtime_error{
572- std::string{" Error: Expected taxonomical ID in first column , but got something I couldn't read: " } +
573- strbuf + " \n " };
562+ std::string{" Error: Expected taxonomical ID, but got something I couldn't read: " } +
563+ static_cast <std::string>(col1) + " \n " };
574564 }
575565
576566 if (taxId >= std::ranges::size (taxonNames))
577567 {
578568 throw std::runtime_error (std::string (" Error: taxonomical ID is " ) + std::to_string (taxId) +
579569 " , but no such taxon in tree.\n " );
580570 }
581- }
582-
583- // we don't need this name
584- if (!taxIdIsPresentOrParent[taxId])
585- continue ;
586571
587- if (++itWord == std::cregex_iterator ())
588- throw std::runtime_error (" Error: Expected name in second column, but couldn't find it.\n " );
589- else
590- name = itWord->str ();
591-
592- while (++itWord != std::cregex_iterator ())
593- {
594- if (itWord->str () == " scientific name" )
595- taxonNames[taxId] = name;
572+ /* second column (name) */
573+ if (taxIdIsPresentOrParent[taxId]) // check if we need the name
574+ taxonNames[taxId].assign (record.fields [2 ].begin (),
575+ record.fields [2 ].end ()); // fields[1] is '|' separator
596576 }
597577 }
598578
0 commit comments