diff --git a/include/fpgalign/search/search.hpp b/include/fpgalign/search/search.hpp index 9ffe527..5425227 100644 --- a/include/fpgalign/search/search.hpp +++ b/include/fpgalign/search/search.hpp @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -19,8 +20,44 @@ struct hit std::vector bins; }; +struct wip_alignment +{ + size_t bin; + size_t sequence_number; + size_t position; + std::vector seq; + std::vector ref; + std::string id; +}; + +class alignment_vector +{ +private: + std::vector data; + std::mutex mtx; + +public: + std::vector & get() noexcept + { + return data; + } + + std::vector const & get() const noexcept + { + return data; + } + + void emplace_back(wip_alignment elem) + { + std::lock_guard guard{mtx}; + data.emplace_back(std::move(elem)); + } + + void emplace_back(wip_alignment & elem) = delete; +}; + void search(config const & config); std::vector ibf(config const & config, size_t & todo_bin_count); -void fmindex(config const & config, std::vector hits, size_t const todo_bin_count); +std::vector fmindex(config const & config, std::vector hits, size_t const todo_bin_count); } // namespace search diff --git a/src/argument_parsing.cpp b/src/argument_parsing.cpp index 620e5f1..38c0bbf 100644 --- a/src/argument_parsing.cpp +++ b/src/argument_parsing.cpp @@ -25,7 +25,8 @@ config parse_arguments(sharg::parser & parser) .long_id = "output", .description = "Output path", .required = true, - .validator = sharg::output_file_validator{}}); // .ibf and .fmindex + .validator = sharg::output_file_validator{ + sharg::output_file_open_options::open_or_create}}); // .ibf and .fmindex parser.add_option(config.threads, sharg::config{.short_id = '\0', .long_id = "threads", @@ -86,12 +87,13 @@ config parse_arguments(sharg::parser & parser) .description = "Query path", .required = true, .validator = sharg::input_file_validator{}}); - parser.add_option(config.output_path, - sharg::config{.short_id = '\0', - .long_id = "output", - .description = "Output path", - .required = true, - .validator = sharg::output_file_validator{}}); + parser.add_option( + config.output_path, + sharg::config{.short_id = '\0', + .long_id = "output", + .description = "Output path", + .required = true, + .validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create}}); parser.add_option(config.threads, sharg::config{.short_id = '\0', .long_id = "threads", diff --git a/src/build/fmindex.cpp b/src/build/fmindex.cpp index 0f5bcc8..01c360f 100644 --- a/src/build/fmindex.cpp +++ b/src/build/fmindex.cpp @@ -34,7 +34,7 @@ std::vector> read_reference(std::vector const | std::views::transform( [](auto const & in) { - return seqan3::to_rank(in); + return seqan3::to_rank(in) + 1u; }), std::back_inserter(reference.back())); } @@ -48,7 +48,7 @@ void fmindex(config const & config) for (auto && [id, bin_paths] : seqan::stl::views::enumerate(parse_input(config))) { auto reference = read_reference(bin_paths); - fmc::BiFMIndex<4> index{reference, /*samplingRate*/ 16, config.threads}; + fmc::BiFMIndex<5> index{reference, /*samplingRate*/ 16, config.threads}; { std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), id), std::ios::binary}; diff --git a/src/search/fmindex.cpp b/src/search/fmindex.cpp index 194e868..330b561 100644 --- a/src/search/fmindex.cpp +++ b/src/search/fmindex.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include +#include #include @@ -17,9 +18,25 @@ namespace search { -fmc::BiFMIndex<4> load_index(config const & config, size_t const id) +template +auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector { - fmc::BiFMIndex<4> index{}; + //TODO: möglicher weise ist das identisch zu einfach index.C[1] + auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0]; + for (size_t i{0}; i < totalNumberOfSeq; ++i) + { + auto idx = std::get<0>(std::get<0>(index.locate(i))); + if (idx == seqNbr) + { + return reconstructText(index, i); + } + } + throw std::runtime_error{"unknown sequence number"}; +} + +fmc::BiFMIndex<5> load_index(config const & config, size_t const id) +{ + fmc::BiFMIndex<5> index{}; { std::ifstream os{fmt::format("{}.{}.fmindex", config.input_path.c_str(), id), std::ios::binary}; @@ -27,65 +44,95 @@ fmc::BiFMIndex<4> load_index(config const & config, size_t const id) iarchive(index); } + // { + // fmt::println(" === Reconstruct all ==="); + // auto text = fmc::reconstructText(index); + // for (size_t i = 0; i < text.size(); ++i) + // fmt::print("### {} ###\n{}\n", i, fmt::join(text[i], "")); + // } + // { + // fmt::println(" === Reconstruct single ==="); + // auto text = fmc::reconstructText(index, 0); + // fmt::print("### 0 ###\n{}\n", fmt::join(text, "")); + // text = fmc::reconstructText(index, 1); + // fmt::print("### 1 ###\n{}\n", fmt::join(text, "")); + // } + // { + // fmt::println(" === Reconstruct alternative ==="); + // auto text = myReconstruct(index, 0); + // fmt::print("### 0 ###\n{}\n", fmt::join(text, "")); + // text = myReconstruct(index, 1); + // fmt::print("### 1 ###\n{}\n", fmt::join(text, "")); + // } + return index; } -void fmindex(config const & config, std::vector hits, size_t const todo_bin_count) +std::vector fmindex(config const & config, std::vector hits, size_t const todo_bin_count) { // todo bin count // todo capacity // each slot = 1 bin // a cart is full if it has 5 elements (hits) - scq::slotted_cart_queue queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}}; - size_t thread_id{}; - - auto get_thread = [&]() + alignment_vector res; { - return std::jthread( - [&, thread_id = thread_id++]() - { - while (true) + scq::slotted_cart_queue queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}}; + size_t thread_id{}; + + auto get_thread = [&]() + { + return std::jthread( + [&, thread_id = thread_id++]() { - scq::cart_future cart = queue.dequeue(); - if (!cart.valid()) - return; - auto [slot, span] = cart.get(); - auto index = load_index(config, slot.value); - for (auto idx : span) + while (true) { - auto & [id, seq, bins] = hits[idx]; - - auto callback = [&](auto cursor, size_t) + scq::cart_future cart = queue.dequeue(); + if (!cart.valid()) + return; + auto [slot, span] = cart.get(); + auto index = load_index(config, slot.value); + for (auto idx : span) { - for (auto j : cursor) + auto & [id, seq, bins] = hits[idx]; + + auto callback = [&](auto cursor, size_t) { - auto [entry, offset] = index.locate(j); - auto [seqId, pos] = entry; + for (auto j : cursor) { - fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n", - thread_id, - id, - slot.value, - seqId, - pos + offset); + auto [entry, offset] = index.locate(j); + auto [seqId, pos] = entry; + // fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n", + // thread_id, + // id, + // slot.value, + // seqId, + // pos + offset); + res.emplace_back(wip_alignment{.bin = slot.value, + .sequence_number = seqId, + .position = pos + offset, + .seq = seq, + .ref = myReconstruct(index, seqId), + .id = id}); // todo seq is copied } - } - }; + }; - fmc::search(index, seq, config.errors, callback); + fmc::search(index, seq, config.errors, callback); + } } - } - }); - }; + }); + }; + + std::vector worker(config.threads); + std::ranges::generate(worker, get_thread); - std::vector worker(config.threads); - std::ranges::generate(worker, get_thread); + for (auto && [idx, hit] : seqan::stl::views::enumerate(hits)) + for (auto bin : hit.bins) + queue.enqueue(scq::slot_id{bin}, idx); - for (auto && [idx, hit] : seqan::stl::views::enumerate(hits)) - for (auto bin : hit.bins) - queue.enqueue(scq::slot_id{bin}, idx); + queue.close(); + } // Wait for threads to finish - queue.close(); + return res.get(); } } // namespace search diff --git a/src/search/ibf.cpp b/src/search/ibf.cpp index b215e2e..824c407 100644 --- a/src/search/ibf.cpp +++ b/src/search/ibf.cpp @@ -90,7 +90,7 @@ std::vector ibf(config const & config, size_t & todo_bin_count) | std::views::transform( [](auto const & in) { - return seqan3::to_rank(in); + return seqan3::to_rank(in) + 1u; }), std::back_inserter(hits[i].seq)); std::ranges::copy(result, std::back_inserter(hits[i].bins)); diff --git a/src/search/search.cpp b/src/search/search.cpp index 42516b9..bdae45f 100644 --- a/src/search/search.cpp +++ b/src/search/search.cpp @@ -2,16 +2,88 @@ // SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include + #include namespace search { +auto format_as(wip_alignment wip) +{ + return fmt::format("(bin = {}, seqNo = {}, pos = {}, seq = {:d}, ref = {:d})", + wip.bin, + wip.sequence_number, + wip.position, + fmt::join(wip.seq, ""), + fmt::join(wip.ref, "")); +} + +void do_alignment(config const & config, std::vector const & wips) +{ + seqan3::sam_file_output sam_out{config.output_path, + seqan3::fields{}}; + + seqan3::configuration const align_config = + seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{true}, + seqan3::align_cfg::free_end_gaps_sequence2_leading{false}, + seqan3::align_cfg::free_end_gaps_sequence1_trailing{true}, + seqan3::align_cfg::free_end_gaps_sequence2_trailing{false}} + | seqan3::align_cfg::edit_scheme | seqan3::align_cfg::output_alignment{} + | seqan3::align_cfg::output_begin_position{} | seqan3::align_cfg::output_score{}; + + for (auto & wip : wips) + { + size_t const start = wip.position - static_cast(wip.position != 0u); + size_t const length = wip.seq.size(); + auto it = std::ranges::next(wip.ref.begin(), start, wip.ref.end()); + auto end = std::ranges::next(it, length + 1u, wip.ref.end()); + std::span ref_text{it, end}; + + for (auto && alignment : seqan3::align_pairwise(std::tie(ref_text, wip.seq), align_config)) + { + auto cigar = seqan3::cigar_from_alignment(alignment.alignment()); + size_t ref_offset = alignment.sequence1_begin_position() + 2 + start; + size_t map_qual = 60u + alignment.score(); + + sam_out.emplace_back(std::views::transform(wip.seq, + [](auto const in) + { + return seqan3::dna4{}.assign_rank( + static_cast(in - 1u)); + }), + wip.id, + fmt::format("{}", wip.sequence_number), // todo ref storage + ref_offset, + cigar, + // record.base_qualities(), + map_qual); + } + } +} + void search(config const & config) { size_t todo_bin_count{}; std::vector hits = ibf(config, todo_bin_count); - fmindex(config, std::move(hits), todo_bin_count); + auto const res = fmindex(config, std::move(hits), todo_bin_count); + do_alignment(config, res); + // for (auto const & elem : res) + // { + // fmt::print("{}\n", elem); + // } } } // namespace search