diff --git a/include/fpgalign/build/build.hpp b/include/fpgalign/build/build.hpp index 4e7e5cf..c2b2311 100644 --- a/include/fpgalign/build/build.hpp +++ b/include/fpgalign/build/build.hpp @@ -8,6 +8,7 @@ #include #include +#include namespace build { @@ -15,7 +16,7 @@ namespace build std::vector> parse_input(config const & config); void build(config const & config); -void ibf(config const & config); -void fmindex(config const & config); +void ibf(config const & config, meta & meta); +void fmindex(config const & config, meta & meta); } // namespace build diff --git a/include/fpgalign/meta.hpp b/include/fpgalign/meta.hpp new file mode 100644 index 0000000..8e2065b --- /dev/null +++ b/include/fpgalign/meta.hpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +#include + +#include +#include + +struct dna4_traits : seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +using seqfile_t = seqan3::sequence_file_input>; +using record_t = typename seqfile_t::record_type; + +struct meta +{ + uint8_t kmer_size{}; + uint32_t window_size{}; + size_t number_of_bins{}; + std::vector> bin_paths; + std::vector> ref_ids; + std::vector>> references; + std::vector queries; + + template + void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) + { + archive(kmer_size); + archive(window_size); + archive(number_of_bins); + archive(ref_ids); + } +}; diff --git a/include/fpgalign/search/search.hpp b/include/fpgalign/search/search.hpp index 5425227..ab25f91 100644 --- a/include/fpgalign/search/search.hpp +++ b/include/fpgalign/search/search.hpp @@ -9,14 +9,13 @@ #include #include +#include namespace search { struct hit { - std::string id; - std::vector seq; std::vector bins; }; @@ -25,9 +24,7 @@ struct wip_alignment size_t bin; size_t sequence_number; size_t position; - std::vector seq; - std::vector ref; - std::string id; + size_t idx; }; class alignment_vector @@ -57,7 +54,7 @@ class alignment_vector }; void search(config const & config); -std::vector ibf(config const & config, size_t & todo_bin_count); -std::vector fmindex(config const & config, std::vector hits, size_t const todo_bin_count); +std::vector ibf(config const & config, meta & meta); +std::vector fmindex(config const & config, meta & meta, std::vector hits); } // namespace search diff --git a/src/argument_parsing.cpp b/src/argument_parsing.cpp index 38c0bbf..63590b6 100644 --- a/src/argument_parsing.cpp +++ b/src/argument_parsing.cpp @@ -98,18 +98,6 @@ config parse_arguments(sharg::parser & parser) sharg::config{.short_id = '\0', .long_id = "threads", .description = "The number of threads to use."}); // positive_integer_validator - - parser.add_subsection("k-mer options"); - parser.add_option(config.kmer_size, - sharg::config{.short_id = '\0', - .long_id = "kmer", - .description = "The k-mer size.", - .validator = sharg::arithmetic_range_validator{1, 32}}); - parser.add_option(config.window_size, - sharg::config{.short_id = '\0', - .long_id = "window", - .description = "The window size.", - .default_message = "k-mer size"}); parser.add_option(config.errors, sharg::config{.short_id = '\0', .long_id = "errors", @@ -118,11 +106,6 @@ config parse_arguments(sharg::parser & parser) parser.parse(); - if (parser.is_option_set("kmer") && !parser.is_option_set("window")) - config.window_size = config.kmer_size; - else if (config.window_size < config.kmer_size) - throw sharg::validation_error{"k-mer size cannot be smaller than window size!"}; - return config; } diff --git a/src/build/build.cpp b/src/build/build.cpp index 2d8e3d9..3ae44e6 100644 --- a/src/build/build.cpp +++ b/src/build/build.cpp @@ -2,10 +2,15 @@ // SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik // SPDX-License-Identifier: BSD-3-Clause +#include #include #include +#include + +#include #include +#include namespace build { @@ -37,8 +42,19 @@ std::vector> parse_input(config const & config) void build(config const & config) { - build::ibf(config); - build::fmindex(config); + meta meta{}; + meta.bin_paths = parse_input(config); + meta.number_of_bins = meta.bin_paths.size(); + build::ibf(config, meta); + assert(meta.kmer_size == config.kmer_size); + assert(meta.window_size == config.window_size); + build::fmindex(config, meta); + + { + std::ofstream os{fmt::format("{}.meta", config.output_path.c_str()), std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(meta); + } } } // namespace build diff --git a/src/build/fmindex.cpp b/src/build/fmindex.cpp index 01c360f..ac2231d 100644 --- a/src/build/fmindex.cpp +++ b/src/build/fmindex.cpp @@ -6,8 +6,6 @@ #include -#include - #include #include @@ -20,15 +18,17 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna using sequence_alphabet = seqan3::dna4; }; -std::vector> read_reference(std::vector const & bin_paths) +void read_reference_into(std::vector> & reference, meta & meta, size_t const i) { - std::vector> reference{}; + reference.clear(); - for (auto const & bin_path : bin_paths) + for (auto const & bin_path : meta.bin_paths[i]) { - seqan3::sequence_file_input> fin{bin_path}; + seqan3::sequence_file_input> fin{bin_path}; + for (auto && record : fin) { + meta.ref_ids[i].push_back(record.id()); reference.push_back({}); std::ranges::copy(record.sequence() | std::views::transform( @@ -39,21 +39,34 @@ std::vector> read_reference(std::vector const std::back_inserter(reference.back())); } } - - return reference; } -void fmindex(config const & config) +void fmindex(config const & config, meta & meta) { - for (auto && [id, bin_paths] : seqan::stl::views::enumerate(parse_input(config))) + meta.ref_ids.resize(meta.number_of_bins); + +#pragma omp parallel num_threads(config.threads) { - auto reference = read_reference(bin_paths); - fmc::BiFMIndex<5> index{reference, /*samplingRate*/ 16, config.threads}; + std::vector> reference; +#pragma omp for + for (size_t i = 0; i < meta.number_of_bins; ++i) { - std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), id), std::ios::binary}; - cereal::BinaryOutputArchive oarchive{os}; - oarchive(index); + read_reference_into(reference, meta, i); + + fmc::BiFMIndex<5> index{reference, /*samplingRate*/ 16, config.threads}; + + { + std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), i), std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(index); + } + + { + std::ofstream os{fmt::format("{}.{}.ref", config.output_path.c_str(), i), std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(reference); + } } } } diff --git a/src/build/ibf.cpp b/src/build/ibf.cpp index 4da2193..569ad8c 100644 --- a/src/build/ibf.cpp +++ b/src/build/ibf.cpp @@ -24,9 +24,10 @@ static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); } -void ibf(config const & config) +void ibf(config const & config, meta & meta) { - auto const bin_paths = parse_input(config); + meta.kmer_size = config.kmer_size; + meta.window_size = config.window_size; auto get_user_bin_data = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) { @@ -36,7 +37,7 @@ void ibf(config const & config) seqan3::window_size{config.window_size}, seqan3::seed{adjust_seed(config.kmer_size)}); - for (auto && bin_path : bin_paths[user_bin_id]) + for (auto && bin_path : meta.bin_paths[user_bin_id]) { sequence_file_t fin{bin_path}; for (auto && record : fin) @@ -58,7 +59,7 @@ void ibf(config const & config) }; seqan::hibf::config ibf_config{.input_fn = get_user_bin_data, - .number_of_user_bins = bin_paths.size(), + .number_of_user_bins = meta.bin_paths.size(), .number_of_hash_functions = config.hash_count, .maximum_fpr = config.fpr, .threads = config.threads}; diff --git a/src/search/fmindex.cpp b/src/search/fmindex.cpp index 330b561..60000b7 100644 --- a/src/search/fmindex.cpp +++ b/src/search/fmindex.cpp @@ -18,21 +18,21 @@ namespace search { -template -auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector -{ - //TODO: möglicher weise ist das identisch zu einfach index.C[1] - auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0]; - for (size_t i{0}; i < totalNumberOfSeq; ++i) - { - auto idx = std::get<0>(std::get<0>(index.locate(i))); - if (idx == seqNbr) - { - return reconstructText(index, i); - } - } - throw std::runtime_error{"unknown sequence number"}; -} +// template +// auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector +// { +// //TODO: möglicher weise ist das identisch zu einfach index.C[1] +// auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0]; +// for (size_t i{0}; i < totalNumberOfSeq; ++i) +// { +// auto idx = std::get<0>(std::get<0>(index.locate(i))); +// if (idx == seqNbr) +// { +// return reconstructText(index, i); +// } +// } +// throw std::runtime_error{"unknown sequence number"}; +// } fmc::BiFMIndex<5> load_index(config const & config, size_t const id) { @@ -68,7 +68,7 @@ fmc::BiFMIndex<5> load_index(config const & config, size_t const id) return index; } -std::vector fmindex(config const & config, std::vector hits, size_t const todo_bin_count) +std::vector fmindex(config const & config, meta & meta, std::vector hits) { // todo bin count // todo capacity @@ -76,7 +76,8 @@ std::vector fmindex(config const & config, std::vector hits, // a cart is full if it has 5 elements (hits) alignment_vector res; { - scq::slotted_cart_queue queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}}; + scq::slotted_cart_queue queue{ + {.slots = meta.number_of_bins, .carts = meta.number_of_bins, .capacity = 5}}; size_t thread_id{}; auto get_thread = [&]() @@ -93,30 +94,26 @@ std::vector fmindex(config const & config, std::vector hits, auto index = load_index(config, slot.value); for (auto idx : span) { - auto & [id, seq, bins] = hits[idx]; - auto callback = [&](auto cursor, size_t) { for (auto j : cursor) { auto [entry, offset] = index.locate(j); auto [seqId, pos] = entry; - // fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n", - // thread_id, - // id, - // slot.value, - // seqId, - // pos + offset); res.emplace_back(wip_alignment{.bin = slot.value, .sequence_number = seqId, .position = pos + offset, - .seq = seq, - .ref = myReconstruct(index, seqId), - .id = id}); // todo seq is copied + .idx = idx}); } }; - fmc::search(index, seq, config.errors, callback); + auto seq_view = std::views::transform(meta.queries[idx].sequence(), + [](seqan3::dna4 const in) -> uint8_t + { + return in.to_rank() + 1u; + }); + + fmc::search(index, seq_view, config.errors, callback); } } }); diff --git a/src/search/ibf.cpp b/src/search/ibf.cpp index 824c407..a99d7be 100644 --- a/src/search/ibf.cpp +++ b/src/search/ibf.cpp @@ -26,7 +26,7 @@ static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); } -threshold::threshold get_thresholder(config const & config) +threshold::threshold get_thresholder(config const & config, meta const & meta) { size_t const first_sequence_size = [&]() { @@ -35,8 +35,8 @@ threshold::threshold get_thresholder(config const & config) return record.sequence().size(); }(); - return {threshold::threshold_parameters{.window_size = config.window_size, - .shape = seqan3::ungapped{config.kmer_size}, + return {threshold::threshold_parameters{.window_size = meta.window_size, + .shape = seqan3::ungapped{meta.kmer_size}, .query_length = first_sequence_size, .errors = config.errors}}; } @@ -44,7 +44,7 @@ threshold::threshold get_thresholder(config const & config) using seqfile_t = seqan3::sequence_file_input>; using record_t = typename seqfile_t::record_type; -std::vector ibf(config const & config, size_t & todo_bin_count) +std::vector ibf(config const & config, meta & meta) { seqan::hibf::interleaved_bloom_filter ibf{}; @@ -53,9 +53,9 @@ std::vector ibf(config const & config, size_t & todo_bin_count) cereal::BinaryInputArchive iarchive{os}; iarchive(ibf); } - todo_bin_count = ibf.bin_count(); + assert(ibf.bin_count() == meta.number_of_bins); - std::vector records = [&]() + meta.queries = [&]() { std::vector result{}; seqfile_t fin{config.query_path}; @@ -65,49 +65,30 @@ std::vector ibf(config const & config, size_t & todo_bin_count) return result; }(); - std::vector hits(records.size()); + std::vector hits(meta.queries.size()); #pragma omp parallel num_threads(config.threads) { auto agent = ibf.membership_agent(); - threshold::threshold const thresholder = get_thresholder(config); - auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{config.kmer_size}, - seqan3::window_size{config.window_size}, - seqan3::seed{adjust_seed(config.kmer_size)}); + threshold::threshold const thresholder = get_thresholder(config, meta); + auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{meta.kmer_size}, + seqan3::window_size{meta.window_size}, + seqan3::seed{adjust_seed(meta.kmer_size)}); std::vector hashes; #pragma omp for - for (size_t i = 0; i < records.size(); ++i) + for (size_t i = 0; i < meta.queries.size(); ++i) { - auto & [id, seq] = records[i]; + auto & [id, seq] = meta.queries[i]; auto view = seq | minimiser_view | std::views::common; hashes.clear(); hashes.assign(view.begin(), view.end()); auto & result = agent.membership_for(hashes, thresholder.get(hashes.size())); - hits[i].id = std::move(id); - std::ranges::copy(std::move(seq) - | std::views::transform( - [](auto const & in) - { - return seqan3::to_rank(in) + 1u; - }), - std::back_inserter(hits[i].seq)); std::ranges::copy(result, std::back_inserter(hits[i].bins)); } } - for (auto & hit : hits) - { - std::cout << hit.id << ": "; - for (auto chr : hit.seq) - std::cout << static_cast(chr); - std::cout << "\n "; - for (auto bin : hit.bins) - std::cout << bin << ','; - std::cout << '\n'; - } - return hits; } diff --git a/src/search/search.cpp b/src/search/search.cpp index bdae45f..bdde0e3 100644 --- a/src/search/search.cpp +++ b/src/search/search.cpp @@ -15,17 +15,17 @@ namespace search { -auto format_as(wip_alignment wip) -{ - return fmt::format("(bin = {}, seqNo = {}, pos = {}, seq = {:d}, ref = {:d})", - wip.bin, - wip.sequence_number, - wip.position, - fmt::join(wip.seq, ""), - fmt::join(wip.ref, "")); -} +// auto format_as(wip_alignment wip) +// { +// return fmt::format("(bin = {}, seqNo = {}, pos = {}, seq = {:d}, ref = {:d})", +// wip.bin, +// wip.sequence_number, +// wip.position, +// fmt::join(wip.seq, ""), +// fmt::join(wip.ref, "")); +// } -void do_alignment(config const & config, std::vector const & wips) +void do_alignment(config const & config, meta & meta, std::vector const & wips) { seqan3::sam_file_output sam_out{config.output_path, seqan3::fields const & wips | seqan3::align_cfg::edit_scheme | seqan3::align_cfg::output_alignment{} | seqan3::align_cfg::output_begin_position{} | seqan3::align_cfg::output_score{}; - for (auto & wip : wips) + for (auto [bin, sequence_number, position, idx] : wips) { - size_t const start = wip.position - static_cast(wip.position != 0u); - size_t const length = wip.seq.size(); - auto it = std::ranges::next(wip.ref.begin(), start, wip.ref.end()); - auto end = std::ranges::next(it, length + 1u, wip.ref.end()); + auto & seq = meta.queries[idx].sequence(); + auto seq_view = std::views::transform(seq, + [](seqan3::dna4 const in) -> uint8_t + { + return in.to_rank() + 1u; + }); + auto & seq_id = meta.queries[idx].id(); + auto & ref = meta.references[bin][sequence_number]; + auto & ref_id = meta.ref_ids[bin][sequence_number]; + + size_t const start = position - static_cast(position != 0u); + size_t const length = seq.size(); + auto it = std::ranges::next(ref.begin(), start, ref.end()); + auto end = std::ranges::next(it, length + 1u, ref.end()); std::span ref_text{it, end}; - for (auto && alignment : seqan3::align_pairwise(std::tie(ref_text, wip.seq), align_config)) + for (auto && alignment : seqan3::align_pairwise(std::tie(ref_text, seq_view), align_config)) { auto cigar = seqan3::cigar_from_alignment(alignment.alignment()); size_t ref_offset = alignment.sequence1_begin_position() + 2 + start; size_t map_qual = 60u + alignment.score(); - sam_out.emplace_back(std::views::transform(wip.seq, - [](auto const in) - { - return seqan3::dna4{}.assign_rank( - static_cast(in - 1u)); - }), - wip.id, - fmt::format("{}", wip.sequence_number), // todo ref storage + sam_out.emplace_back(seq, + seq_id, + ref_id, ref_offset, cigar, // record.base_qualities(), @@ -76,10 +81,25 @@ void do_alignment(config const & config, std::vector const & wips void search(config const & config) { - size_t todo_bin_count{}; - std::vector hits = ibf(config, todo_bin_count); - auto const res = fmindex(config, std::move(hits), todo_bin_count); - do_alignment(config, res); + meta meta{}; + { + std::ifstream is{fmt::format("{}.meta", config.input_path.c_str()), std::ios::binary}; + cereal::BinaryInputArchive iarchive{is}; + iarchive(meta); + } + + meta.references.resize(meta.number_of_bins); + for (size_t i = 0; i < meta.number_of_bins; ++i) + { + + std::ifstream is{fmt::format("{}.{}.ref", config.input_path.c_str(), i), std::ios::binary}; + cereal::BinaryInputArchive iarchive{is}; + iarchive(meta.references[i]); + } + + std::vector hits = ibf(config, meta); + auto const res = fmindex(config, meta, std::move(hits)); + do_alignment(config, meta, res); // for (auto const & elem : res) // { // fmt::print("{}\n", elem);