Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions include/fpgalign/build/build.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
#include <vector>

#include <fpgalign/config.hpp>
#include <fpgalign/meta.hpp>

namespace build
{

std::vector<std::vector<std::string>> parse_input(config const & config);

void build(config const & config);
void ibf(config const & config);
void fmindex(config const & config);
void ibf(config const & config, meta & meta);
void fmindex(config const & config, meta & meta);

} // namespace build
42 changes: 42 additions & 0 deletions include/fpgalign/meta.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <filesystem>
#include <string>
#include <vector>

#include <seqan3/io/sequence_file/input.hpp>

#include <cereal/types/string.hpp>
#include <cereal/types/vector.hpp>

struct dna4_traits : seqan3::sequence_file_input_default_traits_dna
{
using sequence_alphabet = seqan3::dna4;
};

using seqfile_t = seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>>;
using record_t = typename seqfile_t::record_type;

struct meta
{
uint8_t kmer_size{};
uint32_t window_size{};
size_t number_of_bins{};
std::vector<std::vector<std::string>> bin_paths;
std::vector<std::vector<std::string>> ref_ids;
std::vector<std::vector<std::vector<uint8_t>>> references;
std::vector<record_t> queries;

template <typename archive_t>
void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
{
archive(kmer_size);
archive(window_size);
archive(number_of_bins);
archive(ref_ids);
}
};
11 changes: 4 additions & 7 deletions include/fpgalign/search/search.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,13 @@
#include <vector>

#include <fpgalign/config.hpp>
#include <fpgalign/meta.hpp>

namespace search
{

struct hit
{
std::string id;
std::vector<uint8_t> seq;
std::vector<uint64_t> bins;
};

Expand All @@ -25,9 +24,7 @@ struct wip_alignment
size_t bin;
size_t sequence_number;
size_t position;
std::vector<uint8_t> seq;
std::vector<uint8_t> ref;
std::string id;
size_t idx;
};

class alignment_vector
Expand Down Expand Up @@ -57,7 +54,7 @@ class alignment_vector
};

void search(config const & config);
std::vector<hit> ibf(config const & config, size_t & todo_bin_count);
std::vector<wip_alignment> fmindex(config const & config, std::vector<hit> hits, size_t const todo_bin_count);
std::vector<hit> ibf(config const & config, meta & meta);
std::vector<wip_alignment> fmindex(config const & config, meta & meta, std::vector<hit> hits);

} // namespace search
17 changes: 0 additions & 17 deletions src/argument_parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,6 @@ config parse_arguments(sharg::parser & parser)
sharg::config{.short_id = '\0',
.long_id = "threads",
.description = "The number of threads to use."}); // positive_integer_validator

parser.add_subsection("k-mer options");
parser.add_option(config.kmer_size,
sharg::config{.short_id = '\0',
.long_id = "kmer",
.description = "The k-mer size.",
.validator = sharg::arithmetic_range_validator{1, 32}});
parser.add_option(config.window_size,
sharg::config{.short_id = '\0',
.long_id = "window",
.description = "The window size.",
.default_message = "k-mer size"});
parser.add_option(config.errors,
sharg::config{.short_id = '\0',
.long_id = "errors",
Expand All @@ -118,11 +106,6 @@ config parse_arguments(sharg::parser & parser)

parser.parse();

if (parser.is_option_set("kmer") && !parser.is_option_set("window"))
config.window_size = config.kmer_size;
else if (config.window_size < config.kmer_size)
throw sharg::validation_error{"k-mer size cannot be smaller than window size!"};

return config;
}

Expand Down
20 changes: 18 additions & 2 deletions src/build/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <cassert>
#include <fstream>
#include <sstream>

#include <fmt/format.h>

#include <cereal/archives/binary.hpp>
#include <fpgalign/build/build.hpp>
#include <fpgalign/meta.hpp>

namespace build
{
Expand Down Expand Up @@ -37,8 +42,19 @@ std::vector<std::vector<std::string>> parse_input(config const & config)

void build(config const & config)
{
build::ibf(config);
build::fmindex(config);
meta meta{};
meta.bin_paths = parse_input(config);
meta.number_of_bins = meta.bin_paths.size();
build::ibf(config, meta);
assert(meta.kmer_size == config.kmer_size);
assert(meta.window_size == config.window_size);
build::fmindex(config, meta);

{
std::ofstream os{fmt::format("{}.meta", config.output_path.c_str()), std::ios::binary};
cereal::BinaryOutputArchive oarchive{os};
oarchive(meta);
}
}

} // namespace build
43 changes: 28 additions & 15 deletions src/build/fmindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

#include <seqan3/io/sequence_file/input.hpp>

#include <hibf/contrib/std/enumerate_view.hpp>

#include <fmindex-collection/fmindex/BiFMIndex.h>

#include <fpgalign/build/build.hpp>
Expand All @@ -20,15 +18,17 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna
using sequence_alphabet = seqan3::dna4;
};

std::vector<std::vector<uint8_t>> read_reference(std::vector<std::string> const & bin_paths)
void read_reference_into(std::vector<std::vector<uint8_t>> & reference, meta & meta, size_t const i)
{
std::vector<std::vector<uint8_t>> reference{};
reference.clear();

for (auto const & bin_path : bin_paths)
for (auto const & bin_path : meta.bin_paths[i])
{
seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::seq>> fin{bin_path};
seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::seq, seqan3::field::id>> fin{bin_path};

for (auto && record : fin)
{
meta.ref_ids[i].push_back(record.id());
reference.push_back({});
std::ranges::copy(record.sequence()
| std::views::transform(
Expand All @@ -39,21 +39,34 @@ std::vector<std::vector<uint8_t>> read_reference(std::vector<std::string> const
std::back_inserter(reference.back()));
}
}

return reference;
}

void fmindex(config const & config)
void fmindex(config const & config, meta & meta)
{
for (auto && [id, bin_paths] : seqan::stl::views::enumerate(parse_input(config)))
meta.ref_ids.resize(meta.number_of_bins);

#pragma omp parallel num_threads(config.threads)
{
auto reference = read_reference(bin_paths);
fmc::BiFMIndex<5> index{reference, /*samplingRate*/ 16, config.threads};
std::vector<std::vector<uint8_t>> reference;

#pragma omp for
for (size_t i = 0; i < meta.number_of_bins; ++i)
{
std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), id), std::ios::binary};
cereal::BinaryOutputArchive oarchive{os};
oarchive(index);
read_reference_into(reference, meta, i);

fmc::BiFMIndex<5> index{reference, /*samplingRate*/ 16, config.threads};

{
std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), i), std::ios::binary};
cereal::BinaryOutputArchive oarchive{os};
oarchive(index);
}

{
std::ofstream os{fmt::format("{}.{}.ref", config.output_path.c_str(), i), std::ios::binary};
cereal::BinaryOutputArchive oarchive{os};
oarchive(reference);
}
}
}
}
Expand Down
9 changes: 5 additions & 4 deletions src/build/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept
return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size);
}

void ibf(config const & config)
void ibf(config const & config, meta & meta)
{
auto const bin_paths = parse_input(config);
meta.kmer_size = config.kmer_size;
meta.window_size = config.window_size;

auto get_user_bin_data = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
Expand All @@ -36,7 +37,7 @@ void ibf(config const & config)
seqan3::window_size{config.window_size},
seqan3::seed{adjust_seed(config.kmer_size)});

for (auto && bin_path : bin_paths[user_bin_id])
for (auto && bin_path : meta.bin_paths[user_bin_id])
{
sequence_file_t fin{bin_path};
for (auto && record : fin)
Expand All @@ -58,7 +59,7 @@ void ibf(config const & config)
};

seqan::hibf::config ibf_config{.input_fn = get_user_bin_data,
.number_of_user_bins = bin_paths.size(),
.number_of_user_bins = meta.bin_paths.size(),
.number_of_hash_functions = config.hash_count,
.maximum_fpr = config.fpr,
.threads = config.threads};
Expand Down
55 changes: 26 additions & 29 deletions src/search/fmindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@
namespace search
{

template <typename Index>
auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector<uint8_t>
{
//TODO: möglicher weise ist das identisch zu einfach index.C[1]
auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0];
for (size_t i{0}; i < totalNumberOfSeq; ++i)
{
auto idx = std::get<0>(std::get<0>(index.locate(i)));
if (idx == seqNbr)
{
return reconstructText(index, i);
}
}
throw std::runtime_error{"unknown sequence number"};
}
// template <typename Index>
// auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector<uint8_t>
// {
// //TODO: möglicher weise ist das identisch zu einfach index.C[1]
// auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0];
// for (size_t i{0}; i < totalNumberOfSeq; ++i)
// {
// auto idx = std::get<0>(std::get<0>(index.locate(i)));
// if (idx == seqNbr)
// {
// return reconstructText(index, i);
// }
// }
// throw std::runtime_error{"unknown sequence number"};
// }

fmc::BiFMIndex<5> load_index(config const & config, size_t const id)
{
Expand Down Expand Up @@ -68,15 +68,16 @@ fmc::BiFMIndex<5> load_index(config const & config, size_t const id)
return index;
}

std::vector<wip_alignment> fmindex(config const & config, std::vector<hit> hits, size_t const todo_bin_count)
std::vector<wip_alignment> fmindex(config const & config, meta & meta, std::vector<hit> hits)
{
// todo bin count
// todo capacity
// each slot = 1 bin
// a cart is full if it has 5 elements (hits)
alignment_vector res;
{
scq::slotted_cart_queue<size_t> queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}};
scq::slotted_cart_queue<size_t> queue{
{.slots = meta.number_of_bins, .carts = meta.number_of_bins, .capacity = 5}};
size_t thread_id{};

auto get_thread = [&]()
Expand All @@ -93,30 +94,26 @@ std::vector<wip_alignment> fmindex(config const & config, std::vector<hit> hits,
auto index = load_index(config, slot.value);
for (auto idx : span)
{
auto & [id, seq, bins] = hits[idx];

auto callback = [&](auto cursor, size_t)
{
for (auto j : cursor)
{
auto [entry, offset] = index.locate(j);
auto [seqId, pos] = entry;
// fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n",
// thread_id,
// id,
// slot.value,
// seqId,
// pos + offset);
res.emplace_back(wip_alignment{.bin = slot.value,
.sequence_number = seqId,
.position = pos + offset,
.seq = seq,
.ref = myReconstruct(index, seqId),
.id = id}); // todo seq is copied
.idx = idx});
}
};

fmc::search<true>(index, seq, config.errors, callback);
auto seq_view = std::views::transform(meta.queries[idx].sequence(),
[](seqan3::dna4 const in) -> uint8_t
{
return in.to_rank() + 1u;
});

fmc::search<true>(index, seq_view, config.errors, callback);
}
}
});
Expand Down
Loading