seqan
diff --git a/src/mkindex.cpp b/src/mkindex.cpp
Lines changed: 0 additions & 1 deletion
diff --git a/src/mkindex_algo.hpp b/src/mkindex_algo.hpp
Lines changed: 48 additions & 68 deletions
diff --git a/src/mkindex_options.hpp b/src/mkindex_options.hpp
Lines changed: 22 additions & 19 deletions
diff --git a/src/search.cpp b/src/search.cpp
Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,6 @@ void argConv3a(LambdaIndexerOptions const & options)
             return realMain<c_indexType, c_origAlph, c_transAlph, AlphabetEnum::MURPHY10>(options);
         case AlphabetEnum::LI10:
             return realMain<c_indexType, c_origAlph, c_transAlph, AlphabetEnum::LI10>(options);
-        //TODO other reduced alphabets
         default:
             throw 45;
     }
 
@@ -35,7 +35,6 @@
 #include <bio/ranges/views/translate_join.hpp>
 
 #include "mkindex_misc.hpp"
-// #include "mkindex_saca.hpp"
 #include "shared_definitions.hpp"
 #include "shared_misc.hpp"
 #include "shared_options.hpp"
@@ -69,7 +68,7 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
     // see http://www.uniprot.org/help/accession_numbers
     // https://www.ncbi.nlm.nih.gov/Sequin/acc.html
     // https://www.ncbi.nlm.nih.gov/refseq/about/
-    // TODO: make sure these don't trigger twice on one ID
+    // REMARK: these might trigger twice on one ID
     std::regex const accRegEx{
       "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}|" // UNIPROT
       "[A-Z][0-9]{5}|[A-Z]{2}[0-9]{6}|"                                       // NCBI nucl
@@ -93,7 +92,6 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
             assert(accToIdRank.count(it->str()) == 0);
             //                              "An accession number appeared twice in the file, but they should be unique.");
 
-            // TODO store acc outside as well
             accToIdRank[it->str()] = rank;
         }
 
@@ -145,14 +143,16 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
         throw std::runtime_error("ERROR: No sequences in file. Aborting.\n");
     }
 
-    size_t maxLen = 0ul;
+    size_t maxLen    = 0ul;
+    size_t lengthSum = 0ul;
     for (auto const & s : originalSeqs)
     {
-        if (std::ranges::size(s) > maxLen)
+        lengthSum += s.size();
+        if (s.size() > maxLen)
         {
-            maxLen = std::ranges::size(s);
+            maxLen = s.size();
         }
-        else if (std::ranges::size(s) == 0ul)
+        else if (s.size() == 0ul)
         {
             throw std::runtime_error(
               "ERROR: Unexpectedly encountered a sequence of length 0 in the file."
@@ -165,6 +165,8 @@ auto loadSubjSeqsAndIds(LambdaIndexerOptions const & options)
             std::ranges::size(originalSeqs),
             "\nLongest sequence read: ",
             maxLen,
+            "\nSum of all sequence lengths: ",
+            lengthSum,
             "\n");
 
     if (options.hasSTaxIds)
@@ -310,8 +312,6 @@ auto mapTaxIDs(TaccToIdRank const & accToIdRank, uint64_t const numSubjects, Lam
 
     myPrint(options, 2, "Runtime: ", sysTime() - start, "s \n");
 
-    // TODO do something with the subjects that have no (valid) taxid?
-
     uint64_t nomap = 0;
     uint64_t multi = 0;
 
@@ -368,34 +368,31 @@ auto parseAndStoreTaxTree(std::vector<bool> & taxIdIsPresent, LambdaIndexerOptio
 
     double start = sysTime();
 
-    std::string      buf;
-    std::regex const numRegEx{"\\b\\d+\\b"};
-
-    //TODO it would be better to do TSV reading here instead of the regex-voodoo, but I need to understand it first o_O
-    bio::io::txt::reader reader{options.taxDumpDir + "/nodes.dmp"};
-    for (std::string_view line : reader)
+    bio::io::txt::reader reader{options.taxDumpDir + "/nodes.dmp", '\t'};
+    for (auto & record : reader)
     {
-        uint32_t n      = 0;
-        uint32_t parent = 0;
-        unsigned i      = 0;
-        for (auto it = std::cregex_iterator(line.begin(), line.end(), numRegEx), itEnd = std::cregex_iterator();
-             (it != itEnd) && (i < 2);
-             ++it, ++i)
+        /* first column (own id) */
+        uint32_t         n    = 0;
+        std::string_view col1 = record.fields[0];
+        auto             res1 = std::from_chars(col1.data(), col1.data() + col1.size(), n);
+        if (res1.ec != std::errc{})
         {
-            std::string            strbuf = it->str();
-            std::from_chars_result res;
-
-            if (i == 0)
-                res = std::from_chars(strbuf.data(), strbuf.data() + strbuf.size(), n);
-            else
-                res = std::from_chars(strbuf.data(), strbuf.data() + strbuf.size(), parent);
+            throw std::runtime_error{
+              std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} +
+              static_cast<std::string>(col1) + "\n"};
+        }
 
-            if (res.ec != std::errc{})
-            {
-                throw std::runtime_error{
-                  std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} + strbuf + "\n"};
-            }
+        /* second column (parent id) */
+        uint32_t         parent = 0;
+        std::string_view col2   = record.fields[2]; // fields[1] is '|'
+        auto             res2   = std::from_chars(col2.data(), col2.data() + col2.size(), parent);
+        if (res2.ec != std::errc{})
+        {
+            throw std::runtime_error{
+              std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} +
+              static_cast<std::string>(col2) + "\n"};
         }
+
         if (std::ranges::size(taxonParentIDs) <= n)
             taxonParentIDs.resize(n + 1, 0);
         taxonParentIDs[n] = parent;
@@ -545,54 +542,37 @@ auto parseAndStoreTaxTree(std::vector<bool> & taxIdIsPresent, LambdaIndexerOptio
 
     start = sysTime();
 
-    std::regex const wordRegEx{R"([\w.,\"<> ]+)"};
-    std::string      name;
-
-    //TODO it would be better to do TSV reading here instead of the regex-voodoo, but I need to understand it first o_O
-    bio::io::txt::reader reader2{options.taxDumpDir + "/names.dmp"};
-    for (std::string_view line : reader2)
+    bio::io::txt::reader reader2{options.taxDumpDir + "/names.dmp", '\t'};
+    for (auto & record : reader2)
     {
-        uint32_t taxId = 0;
+        assert(record.fields.size() == 8);
 
-        auto itWord = std::cregex_iterator(line.begin(), line.end(), wordRegEx);
-        if (itWord == std::cregex_iterator())
-        {
-            throw std::runtime_error("Error: Expected taxonomical ID in first column, but couldn't find it.\n");
-        }
-        else
-        {
-            std::string            strbuf = itWord->str();
-            std::from_chars_result res;
-
-            res = std::from_chars(strbuf.data(), strbuf.data() + strbuf.size(), taxId);
+        if (record.fields.size() < 7) // fields[6] (name class) is read below, so at least 7 fields are required
+            continue;
 
-            if (res.ec != std::errc{})
+        if (record.fields[6] == "scientific name")
+        {
+            /* first column */
+            uint32_t         taxId = 0;
+            std::string_view col1  = record.fields[0];
+            auto             res1  = std::from_chars(col1.data(), col1.data() + col1.size(), taxId);
+            if (res1.ec != std::errc{})
             {
                 throw std::runtime_error{
-                  std::string{"Error: Expected taxonomical ID in first column, but got something I couldn't read: "} +
-                  strbuf + "\n"};
+                  std::string{"Error: Expected taxonomical ID, but got something I couldn't read: "} +
+                  static_cast<std::string>(col1) + "\n"};
             }
 
             if (taxId >= std::ranges::size(taxonNames))
             {
                 throw std::runtime_error(std::string("Error: taxonomical ID is ") + std::to_string(taxId) +
                                          ", but no such taxon in tree.\n");
             }
-        }
-
-        // we don't need this name
-        if (!taxIdIsPresentOrParent[taxId])
-            continue;
 
-        if (++itWord == std::cregex_iterator())
-            throw std::runtime_error("Error: Expected name in second column, but couldn't find it.\n");
-        else
-            name = itWord->str();
-
-        while (++itWord != std::cregex_iterator())
-        {
-            if (itWord->str() == "scientific name")
-                taxonNames[taxId] = name;
+            /* second column (name) */
+            if (taxIdIsPresentOrParent[taxId]) // check if we need the name
+                taxonNames[taxId].assign(record.fields[2].begin(),
+                                         record.fields[2].end()); // fields[1] is '|' separator
         }
     }
 
 
@@ -92,24 +92,22 @@ void parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** ar
 
     parser.add_section("Input Options");
 
-    // TODO Change file extensions, make more generic
+    std::vector<std::string> extensions{"fa", "fq", "fasta", "fastq"};
+#ifdef SEQAN_HAS_ZLIB
+    for (size_t i = 0, count = extensions.size(); i < count; ++i) // index loop: push_back invalidates range-for iterators
+        extensions.push_back(extensions[i] + ".gz");
+#endif
     parser.add_option(options.dbFile,
                       sharg::config{.short_id    = 'd',
                                     .long_id     = "database",
                                     .description = "Database sequences.",
                                     .required    = true,
-                                    .validator   = sharg::input_file_validator{{"fa", "fq", "fasta", "fastq", "gz"}}});
+                                    .validator   = sharg::input_file_validator{extensions}});
 
-    std::vector<std::string> taxExtensions{"accession2taxid", "dat"};
+    extensions = {"accession2taxid", "dat"};
 #ifdef SEQAN_HAS_ZLIB
-    taxExtensions.push_back("accession2taxid.gz");
-    taxExtensions.push_back("accession2taxid.bgzf");
-    taxExtensions.push_back("dat.gz");
-    taxExtensions.push_back("dat.bgzf");
-#endif
-#ifdef SEQAN_HAS_BZIP2
-    taxExtensions.push_back("accession2taxid.bz2");
-    taxExtensions.push_back("dat.bz2");
+    for (size_t i = 0, count = extensions.size(); i < count; ++i) // index loop: push_back invalidates range-for iterators
+        extensions.push_back(extensions[i] + ".gz");
 #endif
 
     parser.add_option(
@@ -120,7 +118,7 @@ void parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** ar
                       "An NCBI or UniProt accession-to-taxid mapping file. Download from "
                       "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/ or "
                       "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/ .",
-                    .validator = sharg::input_file_validator(taxExtensions)});
+                    .validator = sharg::input_file_validator{extensions}});
 
     parser.add_option(options.taxDumpDir,
                       sharg::config{.short_id    = 'x',
@@ -132,13 +130,18 @@ void parseCommandLine(LambdaIndexerOptions & options, int argc, char const ** ar
     parser.add_section("Output Options");
 
     options.indexFilePath = "»INPUT«.lba";
-    parser.add_option(options.indexFilePath,
-                      sharg::config{
-                        .short_id    = 'i',
-                        .long_id     = "index",
-                        .description = "The output path for the index file.",
-                        .validator   = sharg::output_file_validator{sharg::output_file_open_options::create_new,
-                                                                    {"lba", "lta", "lba.gz", "lta.gz"}}
+    extensions            = {"lba", "lta"};
+#ifdef SEQAN_HAS_ZLIB
+    for (size_t i = 0, count = extensions.size(); i < count; ++i) // index loop: push_back invalidates range-for iterators
+        extensions.push_back(extensions[i] + ".gz");
+#endif
+    parser.add_option(
+      options.indexFilePath,
+      sharg::config{
+        .short_id    = 'i',
+        .long_id     = "index",
+        .description = "The output path for the index file.",
+        .validator   = sharg::output_file_validator{sharg::output_file_open_options::create_new, {extensions}}
     });
 
     options.threads = std::max<size_t>(2ul, std::min<size_t>(std::thread::hardware_concurrency(), 4ul));
 
@@ -412,7 +412,7 @@ void realMain(LambdaOptions const & options)
 #ifdef LAMBDA_MICRO_STATS
             double buf = sysTime();
 #endif
-            search(localHolder); //TODO seed refining if iterateMatches gives 0 results
+            search(localHolder);
 #ifdef LAMBDA_MICRO_STATS
             localHolder.stats.timeSearch += sysTime() - buf;
 #endif
Original file line number	Diff line number	Diff line change
`@@ -156,7 +156,6 @@ void argConv3a(LambdaIndexerOptions const & options)`
`156`	`156`	`return realMain<c_indexType, c_origAlph, c_transAlph, AlphabetEnum::MURPHY10>(options);`
`157`	`157`	`case AlphabetEnum::LI10:`
`158`	`158`	`return realMain<c_indexType, c_origAlph, c_transAlph, AlphabetEnum::LI10>(options);`
`159`		`- //TODO other reduced alphabets`
`160`	`159`	`default:`
`161`	`160`	`throw 45;`
`162`	`161`	`}`