Skip to content

Commit bda1315

Browse files
authored
Merge pull request #3 from eseiler/misc/align
First version alignment
2 parents be79fda + 518e8e3 commit bda1315

File tree

6 files changed

+211
-53
lines changed

6 files changed

+211
-53
lines changed

include/fpgalign/search/search.hpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#pragma once
66

7+
#include <mutex>
78
#include <string>
89
#include <vector>
910

@@ -19,8 +20,44 @@ struct hit
1920
std::vector<uint64_t> bins;
2021
};
2122

23+
struct wip_alignment
24+
{
25+
size_t bin;
26+
size_t sequence_number;
27+
size_t position;
28+
std::vector<uint8_t> seq;
29+
std::vector<uint8_t> ref;
30+
std::string id;
31+
};
32+
33+
class alignment_vector
34+
{
35+
private:
36+
std::vector<wip_alignment> data;
37+
std::mutex mtx;
38+
39+
public:
40+
std::vector<wip_alignment> & get() noexcept
41+
{
42+
return data;
43+
}
44+
45+
std::vector<wip_alignment> const & get() const noexcept
46+
{
47+
return data;
48+
}
49+
50+
void emplace_back(wip_alignment elem)
51+
{
52+
std::lock_guard guard{mtx};
53+
data.emplace_back(std::move(elem));
54+
}
55+
56+
void emplace_back(wip_alignment & elem) = delete;
57+
};
58+
2259
void search(config const & config);
2360
std::vector<hit> ibf(config const & config, size_t & todo_bin_count);
24-
void fmindex(config const & config, std::vector<hit> hits, size_t const todo_bin_count);
61+
std::vector<wip_alignment> fmindex(config const & config, std::vector<hit> hits, size_t const todo_bin_count);
2562

2663
} // namespace search

src/argument_parsing.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ config parse_arguments(sharg::parser & parser)
2525
.long_id = "output",
2626
.description = "Output path",
2727
.required = true,
28-
.validator = sharg::output_file_validator{}}); // .ibf and .fmindex
28+
.validator = sharg::output_file_validator{
29+
sharg::output_file_open_options::open_or_create}}); // .ibf and .fmindex
2930
parser.add_option(config.threads,
3031
sharg::config{.short_id = '\0',
3132
.long_id = "threads",
@@ -86,12 +87,13 @@ config parse_arguments(sharg::parser & parser)
8687
.description = "Query path",
8788
.required = true,
8889
.validator = sharg::input_file_validator{}});
89-
parser.add_option(config.output_path,
90-
sharg::config{.short_id = '\0',
91-
.long_id = "output",
92-
.description = "Output path",
93-
.required = true,
94-
.validator = sharg::output_file_validator{}});
90+
parser.add_option(
91+
config.output_path,
92+
sharg::config{.short_id = '\0',
93+
.long_id = "output",
94+
.description = "Output path",
95+
.required = true,
96+
.validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create}});
9597
parser.add_option(config.threads,
9698
sharg::config{.short_id = '\0',
9799
.long_id = "threads",

src/build/fmindex.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ std::vector<std::vector<uint8_t>> read_reference(std::vector<std::string> const
3434
| std::views::transform(
3535
[](auto const & in)
3636
{
37-
return seqan3::to_rank(in);
37+
return seqan3::to_rank(in) + 1u;
3838
}),
3939
std::back_inserter(reference.back()));
4040
}
@@ -48,7 +48,7 @@ void fmindex(config const & config)
4848
for (auto && [id, bin_paths] : seqan::stl::views::enumerate(parse_input(config)))
4949
{
5050
auto reference = read_reference(bin_paths);
51-
fmc::BiFMIndex<4> index{reference, /*samplingRate*/ 16, config.threads};
51+
fmc::BiFMIndex<5> index{reference, /*samplingRate*/ 16, config.threads};
5252

5353
{
5454
std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), id), std::ios::binary};

src/search/fmindex.cpp

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// SPDX-License-Identifier: BSD-3-Clause
44

55
#include <fmt/format.h>
6+
#include <fmt/ranges.h>
67

78
#include <seqan3/io/sequence_file/input.hpp>
89

@@ -17,75 +18,121 @@
1718
namespace search
1819
{
1920

20-
fmc::BiFMIndex<4> load_index(config const & config, size_t const id)
21+
template <typename Index>
22+
auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector<uint8_t>
2123
{
22-
fmc::BiFMIndex<4> index{};
24+
//TODO: möglicher weise ist das identisch zu einfach index.C[1]
25+
auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0];
26+
for (size_t i{0}; i < totalNumberOfSeq; ++i)
27+
{
28+
auto idx = std::get<0>(std::get<0>(index.locate(i)));
29+
if (idx == seqNbr)
30+
{
31+
return reconstructText(index, i);
32+
}
33+
}
34+
throw std::runtime_error{"unknown sequence number"};
35+
}
36+
37+
fmc::BiFMIndex<5> load_index(config const & config, size_t const id)
38+
{
39+
fmc::BiFMIndex<5> index{};
2340

2441
{
2542
std::ifstream os{fmt::format("{}.{}.fmindex", config.input_path.c_str(), id), std::ios::binary};
2643
cereal::BinaryInputArchive iarchive{os};
2744
iarchive(index);
2845
}
2946

47+
// {
48+
// fmt::println(" === Reconstruct all ===");
49+
// auto text = fmc::reconstructText(index);
50+
// for (size_t i = 0; i < text.size(); ++i)
51+
// fmt::print("### {} ###\n{}\n", i, fmt::join(text[i], ""));
52+
// }
53+
// {
54+
// fmt::println(" === Reconstruct single ===");
55+
// auto text = fmc::reconstructText(index, 0);
56+
// fmt::print("### 0 ###\n{}\n", fmt::join(text, ""));
57+
// text = fmc::reconstructText(index, 1);
58+
// fmt::print("### 1 ###\n{}\n", fmt::join(text, ""));
59+
// }
60+
// {
61+
// fmt::println(" === Reconstruct alternative ===");
62+
// auto text = myReconstruct(index, 0);
63+
// fmt::print("### 0 ###\n{}\n", fmt::join(text, ""));
64+
// text = myReconstruct(index, 1);
65+
// fmt::print("### 1 ###\n{}\n", fmt::join(text, ""));
66+
// }
67+
3068
return index;
3169
}
3270

33-
void fmindex(config const & config, std::vector<hit> hits, size_t const todo_bin_count)
71+
std::vector<wip_alignment> fmindex(config const & config, std::vector<hit> hits, size_t const todo_bin_count)
3472
{
3573
// todo bin count
3674
// todo capacity
3775
// each slot = 1 bin
3876
// a cart is full if it has 5 elements (hits)
39-
scq::slotted_cart_queue<size_t> queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}};
40-
size_t thread_id{};
41-
42-
auto get_thread = [&]()
77+
alignment_vector res;
4378
{
44-
return std::jthread(
45-
[&, thread_id = thread_id++]()
46-
{
47-
while (true)
79+
scq::slotted_cart_queue<size_t> queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}};
80+
size_t thread_id{};
81+
82+
auto get_thread = [&]()
83+
{
84+
return std::jthread(
85+
[&, thread_id = thread_id++]()
4886
{
49-
scq::cart_future<size_t> cart = queue.dequeue();
50-
if (!cart.valid())
51-
return;
52-
auto [slot, span] = cart.get();
53-
auto index = load_index(config, slot.value);
54-
for (auto idx : span)
87+
while (true)
5588
{
56-
auto & [id, seq, bins] = hits[idx];
57-
58-
auto callback = [&](auto cursor, size_t)
89+
scq::cart_future<size_t> cart = queue.dequeue();
90+
if (!cart.valid())
91+
return;
92+
auto [slot, span] = cart.get();
93+
auto index = load_index(config, slot.value);
94+
for (auto idx : span)
5995
{
60-
for (auto j : cursor)
96+
auto & [id, seq, bins] = hits[idx];
97+
98+
auto callback = [&](auto cursor, size_t)
6199
{
62-
auto [entry, offset] = index.locate(j);
63-
auto [seqId, pos] = entry;
100+
for (auto j : cursor)
64101
{
65-
fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n",
66-
thread_id,
67-
id,
68-
slot.value,
69-
seqId,
70-
pos + offset);
102+
auto [entry, offset] = index.locate(j);
103+
auto [seqId, pos] = entry;
104+
// fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n",
105+
// thread_id,
106+
// id,
107+
// slot.value,
108+
// seqId,
109+
// pos + offset);
110+
res.emplace_back(wip_alignment{.bin = slot.value,
111+
.sequence_number = seqId,
112+
.position = pos + offset,
113+
.seq = seq,
114+
.ref = myReconstruct(index, seqId),
115+
.id = id}); // todo seq is copied
71116
}
72-
}
73-
};
117+
};
74118

75-
fmc::search<true>(index, seq, config.errors, callback);
119+
fmc::search<true>(index, seq, config.errors, callback);
120+
}
76121
}
77-
}
78-
});
79-
};
122+
});
123+
};
124+
125+
std::vector<std::jthread> worker(config.threads);
126+
std::ranges::generate(worker, get_thread);
80127

81-
std::vector<std::jthread> worker(config.threads);
82-
std::ranges::generate(worker, get_thread);
128+
for (auto && [idx, hit] : seqan::stl::views::enumerate(hits))
129+
for (auto bin : hit.bins)
130+
queue.enqueue(scq::slot_id{bin}, idx);
83131

84-
for (auto && [idx, hit] : seqan::stl::views::enumerate(hits))
85-
for (auto bin : hit.bins)
86-
queue.enqueue(scq::slot_id{bin}, idx);
132+
queue.close();
133+
} // Wait for threads to finish
87134

88-
queue.close();
135+
return res.get();
89136
}
90137

91138
} // namespace search

src/search/ibf.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ std::vector<hit> ibf(config const & config, size_t & todo_bin_count)
9090
| std::views::transform(
9191
[](auto const & in)
9292
{
93-
return seqan3::to_rank(in);
93+
return seqan3::to_rank(in) + 1u;
9494
}),
9595
std::back_inserter(hits[i].seq));
9696
std::ranges::copy(result, std::back_inserter(hits[i].bins));

src/search/search.cpp

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,88 @@
22
// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
33
// SPDX-License-Identifier: BSD-3-Clause
44

5+
#include <fmt/format.h>
6+
#include <fmt/ranges.h>
7+
8+
#include <seqan3/alignment/cigar_conversion/cigar_from_alignment.hpp>
9+
#include <seqan3/alignment/pairwise/align_pairwise.hpp>
10+
#include <seqan3/alphabet/nucleotide/dna4.hpp>
11+
#include <seqan3/io/sam_file/output.hpp>
12+
513
#include <fpgalign/search/search.hpp>
614

715
namespace search
816
{
917

18+
auto format_as(wip_alignment wip)
19+
{
20+
return fmt::format("(bin = {}, seqNo = {}, pos = {}, seq = {:d}, ref = {:d})",
21+
wip.bin,
22+
wip.sequence_number,
23+
wip.position,
24+
fmt::join(wip.seq, ""),
25+
fmt::join(wip.ref, ""));
26+
}
27+
28+
void do_alignment(config const & config, std::vector<wip_alignment> const & wips)
29+
{
30+
seqan3::sam_file_output sam_out{config.output_path,
31+
seqan3::fields<seqan3::field::seq,
32+
seqan3::field::id,
33+
seqan3::field::ref_id,
34+
seqan3::field::ref_offset,
35+
seqan3::field::cigar,
36+
// seqan3::field::qual,
37+
seqan3::field::mapq>{}};
38+
39+
seqan3::configuration const align_config =
40+
seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{true},
41+
seqan3::align_cfg::free_end_gaps_sequence2_leading{false},
42+
seqan3::align_cfg::free_end_gaps_sequence1_trailing{true},
43+
seqan3::align_cfg::free_end_gaps_sequence2_trailing{false}}
44+
| seqan3::align_cfg::edit_scheme | seqan3::align_cfg::output_alignment{}
45+
| seqan3::align_cfg::output_begin_position{} | seqan3::align_cfg::output_score{};
46+
47+
for (auto & wip : wips)
48+
{
49+
size_t const start = wip.position - static_cast<size_t>(wip.position != 0u);
50+
size_t const length = wip.seq.size();
51+
auto it = std::ranges::next(wip.ref.begin(), start, wip.ref.end());
52+
auto end = std::ranges::next(it, length + 1u, wip.ref.end());
53+
std::span ref_text{it, end};
54+
55+
for (auto && alignment : seqan3::align_pairwise(std::tie(ref_text, wip.seq), align_config))
56+
{
57+
auto cigar = seqan3::cigar_from_alignment(alignment.alignment());
58+
size_t ref_offset = alignment.sequence1_begin_position() + 2 + start;
59+
size_t map_qual = 60u + alignment.score();
60+
61+
sam_out.emplace_back(std::views::transform(wip.seq,
62+
[](auto const in)
63+
{
64+
return seqan3::dna4{}.assign_rank(
65+
static_cast<uint8_t>(in - 1u));
66+
}),
67+
wip.id,
68+
fmt::format("{}", wip.sequence_number), // todo ref storage
69+
ref_offset,
70+
cigar,
71+
// record.base_qualities(),
72+
map_qual);
73+
}
74+
}
75+
}
76+
1077
void search(config const & config)
1178
{
1279
size_t todo_bin_count{};
1380
std::vector<hit> hits = ibf(config, todo_bin_count);
14-
fmindex(config, std::move(hits), todo_bin_count);
81+
auto const res = fmindex(config, std::move(hits), todo_bin_count);
82+
do_alignment(config, res);
83+
// for (auto const & elem : res)
84+
// {
85+
// fmt::print("{}\n", elem);
86+
// }
1587
}
1688

1789
} // namespace search

0 commit comments

Comments
 (0)