diff --git a/.clang-format b/.clang-format index 6e14e76..891f5a3 100644 --- a/.clang-format +++ b/.clang-format @@ -118,12 +118,18 @@ IncludeCategories: Priority: 1 - Regex: '(<[[:alnum:]._]+>)' Priority: 2 - - Regex: ' . + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cmake/cxx_config.cmake b/cmake/cxx_config.cmake index ddeebef..6160df3 100644 --- a/cmake/cxx_config.cmake +++ b/cmake/cxx_config.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause if (NOT DEFINED CMAKE_CXX_STANDARD) set (CMAKE_CXX_STANDARD 23) diff --git a/cmake/output_directories.cmake b/cmake/output_directories.cmake index 4c21d92..121c89e 100644 --- a/cmake/output_directories.cmake +++ b/cmake/output_directories.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake index 307ba16..748e267 100644 --- a/cmake/package-lock.cmake +++ b/cmake/package-lock.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause # CPM Package Lock (https://github.com/cpm-cmake/CPM.cmake) # This file should be committed to version control @@ -8,7 +8,7 @@ # cmake-format: off # hibf -set (HIBF_VERSION 36c223527d7d4eb2bcdcd88d47c71984f50a00a3 CACHE STRING "" FORCE) +set (HIBF_VERSION 36c223527d7d4eb2bcdcd88d47c71984f50a00a3 CACHE STRING "") CPMDeclarePackage (hibf NAME hibf GIT_TAG ${HIBF_VERSION} # main @@ -19,7 +19,7 @@ CPMDeclarePackage (hibf ) # sharg -set (SHARG_VERSION be113bcffe49c0d62cbd65a191820f05386aa8da CACHE STRING "" FORCE) +set (SHARG_VERSION be113bcffe49c0d62cbd65a191820f05386aa8da CACHE STRING "") CPMDeclarePackage (sharg NAME sharg GIT_TAG ${SHARG_VERSION} # main @@ -30,7 +30,7 @@ CPMDeclarePackage (sharg ) # seqan3 -set (SEQAN3_VERSION 6dfa1b442d1fabd07024edcc37a29b61d5beae8f CACHE STRING "" FORCE) +set (SEQAN3_VERSION 6dfa1b442d1fabd07024edcc37a29b61d5beae8f CACHE STRING "") CPMDeclarePackage (seqan3 NAME seqan3 GIT_TAG ${SEQAN3_VERSION} # main @@ -40,8 +40,20 @@ CPMDeclarePackage (seqan3 OPTIONS "INSTALL_SEQAN3 OFF" "CMAKE_MESSAGE_LOG_LEVEL WARNING" ) +# fmt +set (FMT_VERSION 11.2.0 CACHE STRING "") +CPMDeclarePackage (fmt + NAME fmt + VERSION ${FMT_VERSION} + GIT_TAG ${FMT_VERSION} + GITHUB_REPOSITORY fmtlib/fmt + SYSTEM TRUE + EXCLUDE_FROM_ALL TRUE + OPTIONS "CMAKE_MESSAGE_LOG_LEVEL WARNING" "FMT_INSTALL OFF" +) + # fmindex -set (FMINDEX_VERSION 7ed8c3a6035a898f3e9574c5aa96af8b74ca6ef4 CACHE STRING "" FORCE) +set (FMINDEX_VERSION 7ed8c3a6035a898f3e9574c5aa96af8b74ca6ef4 CACHE STRING "") CPMDeclarePackage (fmindex NAME fmindex GIT_TAG ${FMINDEX_VERSION} # main @@ -52,7 +64,7 @@ CPMDeclarePackage (fmindex ) # googletest -set (GOOGLETEST_VERSION 1.17.0 CACHE STRING "" FORCE) +set (GOOGLETEST_VERSION 1.17.0 CACHE STRING "") CPMDeclarePackage (googletest NAME GTest VERSION ${GOOGLETEST_VERSION} @@ -63,7 +75,7 @@ CPMDeclarePackage (googletest ) # use_ccache -set (USE_CCACHE_VERSION d2a54ef555b6fc2d496a4c9506dbeb7cf899ce37 CACHE STRING "" FORCE) +set (USE_CCACHE_VERSION d2a54ef555b6fc2d496a4c9506dbeb7cf899ce37 CACHE STRING "") CPMDeclarePackage (use_ccache NAME use_ccache GIT_TAG ${USE_CCACHE_VERSION} # main @@ -73,4 +85,13 @@ CPMDeclarePackage (use_ccache EXCLUDE_FROM_ALL TRUE ) +# thresholding +CPMDeclarePackage (thresholding + NAME thresholding + URL "${PROJECT_SOURCE_DIR}/contrib/threshold.tar.gz" + URL_HASH SHA256=4990c7fb9778a2fb8a19794b966d57496ca77bcd708b4cee3c93eea6e5b67d80 + SYSTEM TRUE + EXCLUDE_FROM_ALL TRUE +) + # cmake-format: on diff --git a/cmake/test/add_local_data.cmake b/cmake/test/add_local_data.cmake index 4a7bd0c..20ffde2 100644 --- a/cmake/test/add_local_data.cmake +++ b/cmake/test/add_local_data.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause set (DATASOURCES_DATA_DIR "${FPGAlign_SOURCE_DIR}/test/data") @@ -22,7 +22,7 @@ foreach (datasource IN LISTS datasources) # If a matching file contains `@data_dir@`, it will be replaced with the actual data directory. # This is useful if you have test files that need to reference the data directory, e.g., a file containing # paths to other files. - if (FALSE) + if (datasource_name MATCHES ".*\.txt$") configure_file ("${DATASOURCES_DATA_DIR}/${datasource}" "${data_dir}/${datasource_name}") else () configure_file ("${DATASOURCES_DATA_DIR}/${datasource}" "${data_dir}/${datasource_name}" COPYONLY) diff --git a/cmake/test/config.cmake b/cmake/test/config.cmake index 8388704..1ad22eb 100644 --- a/cmake/test/config.cmake +++ b/cmake/test/config.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause list (APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") # Must be before `enable_testing ()`. list (APPEND CMAKE_CTEST_ARGUMENTS "--no-tests=error") # Must be before `enable_testing ()`. diff --git a/cmake/test/declare_datasource.cmake b/cmake/test/declare_datasource.cmake index 00b058e..5802adf 100644 --- a/cmake/test/declare_datasource.cmake +++ b/cmake/test/declare_datasource.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause if (CMAKE_VERSION VERSION_LESS 3.30) include (ExternalProject) diff --git a/contrib/threshold.tar.gz b/contrib/threshold.tar.gz new file mode 100644 index 0000000..4d72179 Binary files /dev/null and b/contrib/threshold.tar.gz differ diff --git a/contrib/threshold.tar.gz.license b/contrib/threshold.tar.gz.license new file mode 100644 index 0000000..e07c809 --- /dev/null +++ b/contrib/threshold.tar.gz.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +SPDX-License-Identifier: BSD-3-Clause diff --git a/include/configuration.hpp b/include/configuration.hpp deleted file mode 100644 index e59798c..0000000 --- a/include/configuration.hpp +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#pragma once - -#include - -struct configuration -{ - std::filesystem::path fastq_input{}; - std::filesystem::path fasta_output{}; - bool verbose{}; // Default is false. -}; diff --git a/include/fastq_conversion.hpp b/include/fastq_conversion.hpp deleted file mode 100644 index 4f02b21..0000000 --- a/include/fastq_conversion.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#pragma once - -#include "configuration.hpp" - -/*! \brief Function, converting fastq files to fasta files. - * \param config The configuration. - * - * Simple function, converting fastq files to fasta files using the seqan3 library. - * For more information about the SeqAn Library functions see https://docs.seqan.de/seqan3/main_user. - */ -void convert_fastq(configuration const & config); diff --git a/include/fpgalign/argument_parsing.hpp b/include/fpgalign/argument_parsing.hpp new file mode 100644 index 0000000..142dfea --- /dev/null +++ b/include/fpgalign/argument_parsing.hpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#include + +enum class subcommand : uint8_t +{ + build, + search +}; + +struct parse_result +{ + subcommand subcmd; + config cfg; +}; + +parse_result parse_arguments(std::vector command_line); diff --git a/include/fpgalign/build/build.hpp b/include/fpgalign/build/build.hpp new file mode 100644 index 0000000..4e7e5cf --- /dev/null +++ b/include/fpgalign/build/build.hpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#include + +namespace build +{ + +std::vector> parse_input(config const & config); + +void build(config const & config); +void ibf(config const & config); +void fmindex(config const & config); + +} // namespace build diff --git a/include/fpgalign/colored_strings.hpp b/include/fpgalign/colored_strings.hpp new file mode 100644 index 0000000..c92355d --- /dev/null +++ b/include/fpgalign/colored_strings.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include + +struct colored_strings +{ + static bool const cerr_is_terminal; + + struct cerr + { + static std::string const error; + static std::string const warning; + }; +}; diff --git a/include/fpgalign/config.hpp b/include/fpgalign/config.hpp new file mode 100644 index 0000000..7bd2775 --- /dev/null +++ b/include/fpgalign/config.hpp @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include + +struct config +{ + uint8_t kmer_size{20u}; + uint32_t window_size{kmer_size}; + + uint8_t hash_count{2u}; + double fpr{0.05}; + + std::filesystem::path input_path{}; + std::filesystem::path output_path{}; + std::filesystem::path query_path{}; + uint8_t errors{}; + uint16_t threads{1u}; +}; diff --git a/include/fpgalign/contrib/slotted_cart_queue.hpp b/include/fpgalign/contrib/slotted_cart_queue.hpp new file mode 100644 index 0000000..a8fafde --- /dev/null +++ b/include/fpgalign/contrib/slotted_cart_queue.hpp @@ -0,0 +1,515 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +// IWYU pragma: begin_exports +#include // for atomic, atomic_bool +#include // for assert +#include // for condition_variable +#include // for future_errc, future_error +#include // for mutex, unique_lock, scoped_lock +#include // for optional +#include // for span +#include // for runtime_error, logic_error, overflow_error +// IWYU pragma: end_exports + +#include // for size_t, ptrdiff_t +#include // for char_traits, operator+, basic_string, to_string, string +#include // for pair +#include // for allocator, move, vector + +namespace scq +{ + +struct params +{ + size_t slots; + size_t carts; + size_t capacity; +}; + +struct slots +{ + size_t value; +}; + +struct carts +{ + size_t value; +}; + +struct capacity +{ + size_t value; +}; + +struct slot_id +{ + size_t value; +}; + +template +class slotted_cart_queue; + +template +class cart_future +{ +public: + cart_future() = default; + cart_future(cart_future const &) = delete; + cart_future(cart_future &&) = default; + cart_future & operator=(cart_future const &) = delete; + cart_future & operator=(cart_future &&) = default; + ~cart_future() + { + if (valid()) + cart_queue->notify_processed_cart(*this); + } + + using value_type = value_t; + + bool valid() const + { + return cart_queue != nullptr; + } + + std::pair> get() + { + if (!valid()) // slotted_cart_queue is already closed and no further elements. + throw std::future_error{std::future_errc::no_state}; + + return {id, memory_region}; + } + +private: + template + friend class slotted_cart_queue; + + scq::slot_id id{}; + std::span memory_region{}; + + slotted_cart_queue * cart_queue{nullptr}; +}; + +template +class slotted_cart_queue +{ + struct cart_memory_id + { + size_t value; + }; + struct queue_memory_t; + struct cart_slots_t; + struct empty_carts_queue_t; + struct full_carts_queue_t; + +public: + using value_type = value_t; + using cart_future_type = cart_future; + + slotted_cart_queue() = default; + slotted_cart_queue(slotted_cart_queue const &) = delete; + slotted_cart_queue(slotted_cart_queue &&) = delete; // TODO: + slotted_cart_queue & operator=(slotted_cart_queue const &) = delete; + slotted_cart_queue & operator=(slotted_cart_queue &&) = delete; // TODO: + + slotted_cart_queue(params params) : + slot_count{params.slots}, + cart_count{params.carts}, + cart_capacity{params.capacity} + { + if (cart_count < slot_count) + throw std::logic_error{"The number of carts must be >= the number of slots."}; + + if (cart_capacity == 0u) + throw std::logic_error{"The cart capacity must be >= 1."}; + } + + void enqueue(slot_id slot, value_type value) + { + using full_cart_type = typename full_carts_queue_t::full_cart_type; + + bool full_queue_was_empty{}; + bool queue_was_closed{}; + + std::optional full_cart{}; + + { + std::unique_lock cart_management_lock(cart_management_mutex); + + queue_was_closed = queue_closed; + + auto slot_cart = cart_slots.slot(slot); + + if (!queue_was_closed && slot_cart.empty()) + { + empty_cart_queue_empty_or_closed_cv.wait(cart_management_lock, + [this, &slot_cart] + { + // wait until either an empty cart is ready, or the slot has a cart, or the queue was closed + return !empty_carts_queue.empty() || !slot_cart.empty() + || queue_closed == true; + }); + + queue_was_closed = queue_closed; + + // if the current slot still has no cart and we have an available empty cart, use that empty cart in + // this slot + if (!queue_was_closed && slot_cart.empty()) + { + // this assert must be true because of the condition within empty_cart_queue_empty_or_closed_cv + assert(!empty_carts_queue.empty()); + + std::span memory_region = empty_carts_queue.dequeue(); + slot_cart.set_memory_region(memory_region); + assert_cart_count_variant(); + } + } + + if (!queue_was_closed) + { + slot_cart.emplace_back(std::move(value)); + + if (slot_cart.full()) + { + full_cart = full_carts_queue_t::move_slot_cart_to_full_cart(slot_cart); + } + } + } + + if (full_cart.has_value()) + { + std::unique_lock full_cart_queue_lock(full_cart_queue_mutex); + + full_queue_was_empty = full_carts_queue.empty(); + + // enqueue later + full_carts_queue.enqueue(std::move(*full_cart)); + assert_cart_count_variant(); + } + + if (full_queue_was_empty) + full_cart_queue_empty_or_closed_cv.notify_all(); + + if (queue_was_closed) + throw std::overflow_error{"slotted_cart_queue is already closed."}; + } + + cart_future_type dequeue() + { + cart_future_type cart_future{}; + + { + std::unique_lock full_cart_queue_lock(full_cart_queue_mutex); + + full_cart_queue_empty_or_closed_cv.wait(full_cart_queue_lock, + [this] + { + // wait until first cart is full + return !full_carts_queue.empty() || queue_closed == true; + }); + + if (!full_carts_queue.empty()) + { + auto full_cart = full_carts_queue.dequeue(); + cart_future.id = full_cart.first; + cart_future.memory_region = std::move(full_cart.second); + cart_future.cart_queue = this; + assert_cart_count_variant(); + } + } + + // NOTE: cart memory will be released by notify_processed_cart after cart_future was destroyed + return cart_future; + } + + void close() + { + { + // this locks the whole queue + std::scoped_lock lock(cart_management_mutex, full_cart_queue_mutex); + + queue_closed = true; + cart_slots.move_active_carts_into_full_carts_queue(full_carts_queue); + assert_cart_count_variant(); + } + + empty_cart_queue_empty_or_closed_cv.notify_all(); + full_cart_queue_empty_or_closed_cv.notify_all(); + } + +private: + size_t slot_count{}; + size_t cart_count{}; + size_t cart_capacity{}; + + queue_memory_t queue_memory{scq::carts{cart_count}, scq::capacity{cart_capacity}}; + empty_carts_queue_t empty_carts_queue{scq::carts{cart_count}, queue_memory}; + full_carts_queue_t full_carts_queue{scq::carts{cart_count}}; + + friend cart_future_type; + + void assert_cart_count_variant() + { + empty_carts_queue.check_invariant(); + full_carts_queue.check_invariant(); + } + + void notify_processed_cart(cart_future_type & cart_future) + { + bool empty_queue_was_empty{}; + { + std::unique_lock cart_management_lock(cart_management_mutex); + + empty_queue_was_empty = empty_carts_queue.empty(); + + empty_carts_queue.enqueue(cart_future.memory_region); + assert_cart_count_variant(); + } + + if (empty_queue_was_empty) + empty_cart_queue_empty_or_closed_cv.notify_all(); + } + + std::atomic_bool queue_closed{false}; + + cart_slots_t cart_slots{scq::slots{slot_count}, scq::capacity{cart_capacity}}; + + std::mutex cart_management_mutex; + std::condition_variable empty_cart_queue_empty_or_closed_cv; + std::mutex full_cart_queue_mutex; + std::condition_variable full_cart_queue_empty_or_closed_cv; +}; + +template +struct slotted_cart_queue::queue_memory_t +{ + queue_memory_t() = default; + queue_memory_t(scq::carts carts, scq::capacity capacity) : + cart_capacity{capacity.value}, + internal_queue_memory(carts.value * capacity.value) + {} + + std::span memory_region(cart_memory_id cart_memory_id) + { + size_t size = cart_capacity; + value_t * begin = internal_queue_memory.data() + cart_memory_id.value * size; + return {begin, size}; + } + + size_t cart_capacity{}; + + std::vector internal_queue_memory{}; +}; + +template +struct slotted_cart_queue::cart_slots_t +{ + cart_slots_t() = default; + cart_slots_t(scq::slots slots, scq::capacity capacity) : + cart_capacity{capacity.value}, + internal_cart_slots(slots.value) // default init slots many vectors + {} + + struct slot_cart_t + { + size_t _slot_id; + size_t cart_capacity; + std::span * memory_region_ptr; + + scq::slot_id slot_id() const + { + return {_slot_id}; + } + + size_t size() const + { + return memory_region().size(); + } + + size_t capacity() const + { + return cart_capacity; + } + + bool empty() const + { + return memory_region().empty(); + } + + bool full() const + { + return size() >= capacity(); + } + + void emplace_back(value_t value) + { + assert(size() < capacity()); + std::span _memory_region = memory_region(); + _memory_region = std::span(_memory_region.data(), _memory_region.size() + 1u); + _memory_region.back() = std::move(value); + set_memory_region(_memory_region); + } + + void set_memory_region(std::span memory_region_span) + { + assert(memory_region_ptr != nullptr); + *memory_region_ptr = std::span{memory_region_span}; + } + + std::span memory_region() const + { + assert(memory_region_ptr != nullptr); + return *memory_region_ptr; + } + }; + + slot_cart_t slot(scq::slot_id slot_id) + { + std::span & memory_region = internal_cart_slots[slot_id.value]; + return {slot_id.value, cart_capacity, &memory_region}; + } + + void move_active_carts_into_full_carts_queue(full_carts_queue_t & full_carts_queue) + { + // TODO: if pending slots are more than queue capacity? is that a problem? + + // put all non-empty / non-full carts into full queue (no element can't be added any more and all pending + // elements = active to fill elements must be processed) + for (size_t slot_id = 0u; slot_id < internal_cart_slots.size(); ++slot_id) + { + auto slot_cart = slot(scq::slot_id{slot_id}); + if (!slot_cart.empty()) + { + auto full_cart = full_carts_queue_t::move_slot_cart_to_full_cart(slot_cart); + full_carts_queue.enqueue(full_cart); + full_carts_queue.check_invariant(); + } + } + } + + size_t cart_capacity{}; + + std::vector> internal_cart_slots{}; // position is slot_id +}; + +template +struct slotted_cart_queue::empty_carts_queue_t +{ + empty_carts_queue_t() = default; + empty_carts_queue_t(carts carts, queue_memory_t & queue_memory) : + count{static_cast(carts.value)}, + cart_count{carts.value}, + internal_queue{} + { + internal_queue.reserve(count); + + for (size_t i = 0; i < cart_count; ++i) + internal_queue.push_back(queue_memory.memory_region(cart_memory_id{i})); + } + + bool empty() + { + return count == 0; + } + + void enqueue(std::span memory_region) + { + internal_queue.push_back(std::span{memory_region.data(), 0}); + + ++count; + } + + std::span dequeue() + { + --count; + + std::span memory_region{internal_queue.back().data(), 0}; + internal_queue.pop_back(); + return memory_region; + } + + void check_invariant() + { + assert(0 <= count); + assert(count <= static_cast(cart_count)); + + if (!(0 <= count)) + throw std::runtime_error{"empty_carts_queue.count: negative"}; + + if (!(count <= static_cast(cart_count))) + throw std::runtime_error{std::string{"empty_carts_queue.count: FULL, count: "} + std::to_string(count) + + " <= " + std::to_string(cart_count)}; + } + + std::atomic count{}; + size_t cart_count{}; + + std::vector> internal_queue{}; +}; + +template +struct slotted_cart_queue::full_carts_queue_t +{ + using full_cart_type = std::pair>; + using slot_cart_type = typename cart_slots_t::slot_cart_t; + + full_carts_queue_t() = default; + full_carts_queue_t(carts carts) : count{0}, cart_count{carts.value} + { + internal_queue.reserve(cart_count); + } + + bool empty() + { + return count == 0; + } + + static full_cart_type move_slot_cart_to_full_cart(slot_cart_type & slot_cart) + { + assert(slot_cart.size() > 0); // at least one element + assert(slot_cart.size() <= slot_cart.capacity()); // at most cart capacity many elements + + full_cart_type full_cart{slot_cart.slot_id(), slot_cart.memory_region()}; + slot_cart.set_memory_region(std::span{}); // reset slot + return full_cart; + } + + void enqueue(full_cart_type full_cart) + { + ++count; + + internal_queue.push_back(full_cart); + } + + full_cart_type dequeue() + { + --count; + + full_cart_type tmp = std::move(internal_queue.back()); + internal_queue.pop_back(); + return tmp; + } + + void check_invariant() + { + assert(0 <= count); + assert(count <= static_cast(cart_count)); + + if (!(0 <= count)) + throw std::runtime_error{"full_carts_queue.count: negative"}; + + if (!(count <= static_cast(cart_count))) + throw std::runtime_error{std::string{"full_carts_queue.count: FULL, count: "} + std::to_string(count) + + " <= " + std::to_string(cart_count)}; + } + + std::atomic count{}; + size_t cart_count{}; + + std::vector internal_queue{}; +}; + +} // namespace scq diff --git a/include/fpgalign/search/search.hpp b/include/fpgalign/search/search.hpp new file mode 100644 index 0000000..9ffe527 --- /dev/null +++ b/include/fpgalign/search/search.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#include + +namespace search +{ + +struct hit +{ + std::string id; + std::vector seq; + std::vector bins; +}; + +void search(config const & config); +std::vector ibf(config const & config, size_t & todo_bin_count); +void fmindex(config const & config, std::vector hits, size_t const todo_bin_count); + +} // namespace search diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4bf270..f661b2e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,19 +1,35 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause cmake_minimum_required (VERSION 3.25) +set (FPGAlign_SOURCE_FILES + argument_parsing.cpp + build/build.cpp + build/fmindex.cpp + build/ibf.cpp + colored_strings.cpp + search/ibf.cpp + search/fmindex.cpp + search/search.cpp +) + # An object library (without main) to be used in multiple targets. # You can add more external include paths of other projects that are needed for your project. -add_library (FPGAlign_lib STATIC fastq_conversion.cpp) +add_library (FPGAlign_lib STATIC ${FPGAlign_SOURCE_FILES}) + target_include_directories (FPGAlign_lib PUBLIC "${FPGAlign_SOURCE_DIR}/include") -target_link_libraries (FPGAlign_lib PUBLIC seqan3::seqan3 sharg::sharg seqan::hibf - fmindex-collection::fmindex-collection -) +target_link_libraries (FPGAlign_lib PUBLIC seqan3::seqan3 sharg::sharg seqan::hibf fmt::fmt fmindex-collection::fmindex-collection seqan::threshold) target_compile_options (FPGAlign_lib PUBLIC "-pedantic" "-Wall" "-Wextra") +option (FPGAlign_WITH_WERROR "Report compiler warnings as errors." ON) +if (FPGAlign_WITH_WERROR) + target_compile_options (FPGAlign_lib PUBLIC "-Werror") + message (STATUS "Building tests with -Werror.") +endif () + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # Disable warning about std::hardware_destructive_interference_size not being ABI-stable. if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) @@ -26,6 +42,13 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") endif () endif () +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # std::jthread is experimental in LLVM < 20 + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 20) + target_compile_definitions (FPGAlign_lib PUBLIC "_LIBCPP_ENABLE_EXPERIMENTAL") + endif () +endif () + # Add the application. -add_executable (FPGAlign main.cpp) +add_executable (FPGAlign fpgalign.cpp) target_link_libraries (FPGAlign PRIVATE FPGAlign_lib) diff --git a/src/argument_parsing.cpp b/src/argument_parsing.cpp new file mode 100644 index 0000000..620e5f1 --- /dev/null +++ b/src/argument_parsing.cpp @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +namespace build +{ + +config parse_arguments(sharg::parser & parser) +{ + config config{}; + + parser.add_subsection("General options"); + parser.add_option(config.input_path, + sharg::config{.short_id = '\0', + .long_id = "input", + .description = "A file containing file names", + .required = true, + .validator = sharg::input_file_validator{}}); + parser.add_option(config.output_path, + sharg::config{.short_id = '\0', + .long_id = "output", + .description = "Output path", + .required = true, + .validator = sharg::output_file_validator{}}); // .ibf and .fmindex + parser.add_option(config.threads, + sharg::config{.short_id = '\0', + .long_id = "threads", + .description = "The number of threads to use."}); // positive_integer_validator + + parser.add_subsection("k-mer options"); + parser.add_option(config.kmer_size, + sharg::config{.short_id = '\0', + .long_id = "kmer", + .description = "The k-mer size.", + .validator = sharg::arithmetic_range_validator{1, 32}}); + parser.add_option(config.window_size, + sharg::config{.short_id = '\0', + .long_id = "window", + .description = "The window size.", + .default_message = "k-mer size"}); + + parser.add_subsection("IBF options"); + parser.add_option(config.fpr, + sharg::config{.short_id = '\0', + .long_id = "fpr", + .description = "The false positive rate.", + .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); + parser.add_option(config.hash_count, + sharg::config{.short_id = '\0', + .long_id = "hash", + .description = "The number of hash functions to use.", + .validator = sharg::arithmetic_range_validator{1, 5}}); + + parser.parse(); + + if (parser.is_option_set("kmer") && !parser.is_option_set("window")) + config.window_size = config.kmer_size; + else if (config.window_size < config.kmer_size) + throw sharg::validation_error{"k-mer size cannot be smaller than window size!"}; + + return config; +} + +} // namespace build + +namespace search +{ + +config parse_arguments(sharg::parser & parser) +{ + config config{}; + + parser.add_subsection("General options"); + parser.add_option(config.input_path, + sharg::config{.short_id = '\0', + .long_id = "input", + .description = "Prefix", + .required = true}); // .ibf and .fmindex + parser.add_option(config.query_path, + sharg::config{.short_id = '\0', + .long_id = "query", + .description = "Query path", + .required = true, + .validator = sharg::input_file_validator{}}); + parser.add_option(config.output_path, + sharg::config{.short_id = '\0', + .long_id = "output", + .description = "Output path", + .required = true, + .validator = sharg::output_file_validator{}}); + parser.add_option(config.threads, + sharg::config{.short_id = '\0', + .long_id = "threads", + .description = "The number of threads to use."}); // positive_integer_validator + + parser.add_subsection("k-mer options"); + parser.add_option(config.kmer_size, + sharg::config{.short_id = '\0', + .long_id = "kmer", + .description = "The k-mer size.", + .validator = sharg::arithmetic_range_validator{1, 32}}); + parser.add_option(config.window_size, + sharg::config{.short_id = '\0', + .long_id = "window", + .description = "The window size.", + .default_message = "k-mer size"}); + parser.add_option(config.errors, + sharg::config{.short_id = '\0', + .long_id = "errors", + .description = "errors.", + .validator = sharg::arithmetic_range_validator{0, 5}}); + + parser.parse(); + + if (parser.is_option_set("kmer") && !parser.is_option_set("window")) + config.window_size = config.kmer_size; + else if (config.window_size < config.kmer_size) + throw sharg::validation_error{"k-mer size cannot be smaller than window size!"}; + + return config; +} + +} // namespace search + +parse_result parse_arguments(std::vector command_line) +{ + sharg::parser parser{"FPGAlign", std::move(command_line)}; + parser.info.author = "SeqAn-Team"; + parser.info.version = "1.0.0"; + parser.add_subcommands({"build", "search"}); + + parser.parse(); + + sharg::parser & sub_parser = parser.get_sub_parser(); + parse_result result{}; + + if (sub_parser.info.app_name == std::string_view{"FPGAlign-build"}) + { + result.subcmd = subcommand::build; + result.cfg = build::parse_arguments(sub_parser); + } + if (sub_parser.info.app_name == std::string_view{"FPGAlign-search"}) + { + result.subcmd = subcommand::search; + result.cfg = search::parse_arguments(sub_parser); + } + + return result; +} diff --git a/src/build/build.cpp b/src/build/build.cpp new file mode 100644 index 0000000..2d8e3d9 --- /dev/null +++ b/src/build/build.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include + +namespace build +{ + +std::vector> parse_input(config const & config) +{ + std::ifstream input_file{config.input_path}; + + std::vector> result; + std::string line; + std::string token; + + while (std::getline(input_file, line)) + { + if (line.empty()) + continue; + + std::vector tokens; + std::istringstream line_stream{line}; + + while (line_stream >> token) + tokens.push_back(token); + + result.push_back(std::move(tokens)); + } + + return result; +} + +void build(config const & config) +{ + build::ibf(config); + build::fmindex(config); +} + +} // namespace build diff --git a/src/build/fmindex.cpp b/src/build/fmindex.cpp new file mode 100644 index 0000000..0f5bcc8 --- /dev/null +++ b/src/build/fmindex.cpp @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include + +#include + +#include + +namespace build +{ + +struct dna4_traits : seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +std::vector> read_reference(std::vector const & bin_paths) +{ + std::vector> reference{}; + + for (auto const & bin_path : bin_paths) + { + seqan3::sequence_file_input> fin{bin_path}; + for (auto && record : fin) + { + reference.push_back({}); + std::ranges::copy(record.sequence() + | std::views::transform( + [](auto const & in) + { + return seqan3::to_rank(in); + }), + std::back_inserter(reference.back())); + } + } + + return reference; +} + +void fmindex(config const & config) +{ + for (auto && [id, bin_paths] : seqan::stl::views::enumerate(parse_input(config))) + { + auto reference = read_reference(bin_paths); + fmc::BiFMIndex<4> index{reference, /*samplingRate*/ 16, config.threads}; + + { + std::ofstream os{fmt::format("{}.{}.fmindex", config.output_path.c_str(), id), std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(index); + } + } +} + +} // namespace build diff --git a/src/build/ibf.cpp b/src/build/ibf.cpp new file mode 100644 index 0000000..4da2193 --- /dev/null +++ b/src/build/ibf.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include +#include + +#include +#include + +namespace build +{ + +struct dna4_traits : seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept +{ + return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); +} + +void ibf(config const & config) +{ + auto const bin_paths = parse_input(config); + + auto get_user_bin_data = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + using sequence_file_t = seqan3::sequence_file_input>; + + auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{config.kmer_size}, + seqan3::window_size{config.window_size}, + seqan3::seed{adjust_seed(config.kmer_size)}); + + for (auto && bin_path : bin_paths[user_bin_id]) + { + sequence_file_t fin{bin_path}; + for (auto && record : fin) + { + if (size_t const record_size = record.sequence().size(); record_size < config.window_size) + { +#pragma omp critical + { + std::cerr << colored_strings::cerr::warning << "File " << std::quoted(bin_path) + << " contains a sequence of length " << record_size + << ". This is shorter than the window size (" << config.window_size + << ") and will result in no k-mers being generated for this sequence. A user bin " + "without k-mers will result in an error.\n"; + } + } + std::ranges::copy(record.sequence() | minimiser_view, it); + } + } + }; + + seqan::hibf::config ibf_config{.input_fn = get_user_bin_data, + .number_of_user_bins = bin_paths.size(), + .number_of_hash_functions = config.hash_count, + .maximum_fpr = config.fpr, + .threads = config.threads}; + + seqan::hibf::interleaved_bloom_filter ibf{ibf_config}; + + { + std::ofstream os{config.output_path.string() + ".ibf", std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(ibf); + } +} + +} // namespace build diff --git a/src/colored_strings.cpp b/src/colored_strings.cpp new file mode 100644 index 0000000..bd22516 --- /dev/null +++ b/src/colored_strings.cpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include + +bool const colored_strings::cerr_is_terminal = sharg::detail::stderr_is_terminal(); + +std::string const colored_strings::cerr::error = []() -> std::string +{ + if (cerr_is_terminal) + return fmt::format(fg(fmt::color::red), "[Error] "); + else + return "[Error] "; +}(); + +std::string const colored_strings::cerr::warning = []() -> std::string +{ + if (cerr_is_terminal) + return fmt::format(fg(fmt::color::yellow), "[Warning] "); + else + return "[Warning] "; +}(); diff --git a/src/fastq_conversion.cpp b/src/fastq_conversion.cpp deleted file mode 100644 index 5576aab..0000000 --- a/src/fastq_conversion.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#include "fastq_conversion.hpp" - -#include - -void convert_fastq(configuration const & config) -{ - seqan3::sequence_file_input file_input{config.fastq_input}; - seqan3::sequence_file_output file_output{std::cout, seqan3::format_fasta{}}; - if (!config.fasta_output.empty()) - file_output = config.fasta_output; - - file_output = file_input; // conversion -} diff --git a/src/fpgalign.cpp b/src/fpgalign.cpp new file mode 100644 index 0000000..d8ad5d7 --- /dev/null +++ b/src/fpgalign.cpp @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) +{ + try + { + parse_result const result = parse_arguments({argv, argv + argc}); + + if (result.subcmd == subcommand::build) + build::build(result.cfg); + if (result.subcmd == subcommand::search) + search::search(result.cfg); + } + catch (std::exception const & ext) + { + std::cerr << colored_strings::cerr::error << ext.what() << '\n'; + std::exit(-1); + } + + return 0; +} diff --git a/src/main.cpp b/src/main.cpp deleted file mode 100644 index aba44b9..0000000 --- a/src/main.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#include - -#include "fastq_conversion.hpp" - -int main(int argc, char ** argv) -{ - // Configuration - configuration config{}; - - // Parser - sharg::parser parser{"Fastq-to-Fasta-Converter", argc, argv}; - - // General information. - parser.info.author = "SeqAn-Team"; - parser.info.version = "1.0.0"; - - // Positional option: The FASTQ file to convert. - parser.add_positional_option(config.fastq_input, - sharg::config{.description = "The FASTQ file to convert.", - .validator = sharg::input_file_validator{{"fq", "fastq"}}}); - - // Open: Output FASTA file. Default: print to terminal - handled in fastq_conversion.cpp. - parser.add_option(config.fasta_output, - sharg::config{.short_id = 'o', - .long_id = "output", - .description = "The output FASTA file.", - .default_message = "Print to terminal (stdout)", - .validator = sharg::output_file_validator{}}); - - // Flag: Verose output. - parser.add_flag( - config.verbose, - sharg::config{.short_id = 'v', .long_id = "verbose", .description = "Give more detailed information."}); - - try - { - parser.parse(); // Trigger command line parsing. - } - catch (sharg::parser_error const & ext) // Catch user errors. - { - std::cerr << "Parsing error. " << ext.what() << '\n'; // Give error message. - return -1; - } - - convert_fastq(config); // Call fastq to fasta converter. - - if (config.verbose) // If flag is set. - std::cerr << "Conversion was a success. Congrats!\n"; - - return 0; -} diff --git a/src/search/fmindex.cpp b/src/search/fmindex.cpp new file mode 100644 index 0000000..194e868 --- /dev/null +++ b/src/search/fmindex.cpp @@ -0,0 +1,91 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include + +#include +#include + +#include +#include + +namespace search +{ + +fmc::BiFMIndex<4> load_index(config const & config, size_t const id) +{ + fmc::BiFMIndex<4> index{}; + + { + std::ifstream os{fmt::format("{}.{}.fmindex", config.input_path.c_str(), id), std::ios::binary}; + cereal::BinaryInputArchive iarchive{os}; + iarchive(index); + } + + return index; +} + +void fmindex(config const & config, std::vector hits, size_t const todo_bin_count) +{ + // todo bin count + // todo capacity + // each slot = 1 bin + // a cart is full if it has 5 elements (hits) + scq::slotted_cart_queue queue{{.slots = todo_bin_count, .carts = todo_bin_count, .capacity = 5}}; + size_t thread_id{}; + + auto get_thread = [&]() + { + return std::jthread( + [&, thread_id = thread_id++]() + { + while (true) + { + scq::cart_future cart = queue.dequeue(); + if (!cart.valid()) + return; + auto [slot, span] = cart.get(); + auto index = load_index(config, slot.value); + for (auto idx : span) + { + auto & [id, seq, bins] = hits[idx]; + + auto callback = [&](auto cursor, size_t) + { + for (auto j : cursor) + { + auto [entry, offset] = index.locate(j); + auto [seqId, pos] = entry; + { + fmt::print("[{}][{}] found hit in bin {} in seqNo {} at Pos {}\n", + thread_id, + id, + slot.value, + seqId, + pos + offset); + } + } + }; + + fmc::search(index, seq, config.errors, callback); + } + } + }); + }; + + std::vector worker(config.threads); + std::ranges::generate(worker, get_thread); + + for (auto && [idx, hit] : seqan::stl::views::enumerate(hits)) + for (auto bin : hit.bins) + queue.enqueue(scq::slot_id{bin}, idx); + + queue.close(); +} + +} // namespace search diff --git a/src/search/ibf.cpp b/src/search/ibf.cpp new file mode 100644 index 0000000..b215e2e --- /dev/null +++ b/src/search/ibf.cpp @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include +#include + +#include +#include + +#include +#include + +namespace search +{ + +struct dna4_traits : seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept +{ + return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); +} + +threshold::threshold get_thresholder(config const & config) +{ + size_t const first_sequence_size = [&]() + { + seqan3::sequence_file_input fin{config.query_path}; + auto & record = *fin.begin(); + return record.sequence().size(); + }(); + + return {threshold::threshold_parameters{.window_size = config.window_size, + .shape = seqan3::ungapped{config.kmer_size}, + .query_length = first_sequence_size, + .errors = config.errors}}; +} + +using seqfile_t = seqan3::sequence_file_input>; +using record_t = typename seqfile_t::record_type; + +std::vector ibf(config const & config, size_t & todo_bin_count) +{ + seqan::hibf::interleaved_bloom_filter ibf{}; + + { + std::ifstream os{config.input_path.string() + ".ibf", std::ios::binary}; + cereal::BinaryInputArchive iarchive{os}; + iarchive(ibf); + } + todo_bin_count = ibf.bin_count(); + + std::vector records = [&]() + { + std::vector result{}; + seqfile_t fin{config.query_path}; + std::ranges::move(fin, std::back_inserter(result)); + // Very fast, improves parallel processing when chunks of the query belong to the same bin. + std::ranges::shuffle(result, std::mt19937_64{0u}); + return result; + }(); + + std::vector hits(records.size()); + +#pragma omp parallel num_threads(config.threads) + { + auto agent = ibf.membership_agent(); + threshold::threshold const thresholder = get_thresholder(config); + auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{config.kmer_size}, + seqan3::window_size{config.window_size}, + seqan3::seed{adjust_seed(config.kmer_size)}); + std::vector hashes; + +#pragma omp for + for (size_t i = 0; i < records.size(); ++i) + { + auto & [id, seq] = records[i]; + auto view = seq | minimiser_view | std::views::common; + hashes.clear(); + hashes.assign(view.begin(), view.end()); + + auto & result = agent.membership_for(hashes, thresholder.get(hashes.size())); + hits[i].id = std::move(id); + std::ranges::copy(std::move(seq) + | std::views::transform( + [](auto const & in) + { + return seqan3::to_rank(in); + }), + std::back_inserter(hits[i].seq)); + std::ranges::copy(result, std::back_inserter(hits[i].bins)); + } + } + + for (auto & hit : hits) + { + std::cout << hit.id << ": "; + for (auto chr : hit.seq) + std::cout << static_cast(chr); + std::cout << "\n "; + for (auto bin : hit.bins) + std::cout << bin << ','; + std::cout << '\n'; + } + + return hits; +} + +} // namespace search diff --git a/src/search/search.cpp b/src/search/search.cpp new file mode 100644 index 0000000..42516b9 --- /dev/null +++ b/src/search/search.cpp @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include + +namespace search +{ + +void search(config const & config) +{ + size_t todo_bin_count{}; + std::vector hits = ibf(config, todo_bin_count); + fmindex(config, std::move(hits), todo_bin_count); +} + +} // namespace search diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6b0807a..3dc727f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause cmake_minimum_required (VERSION 3.25) @@ -11,7 +11,6 @@ include (test/config) # This includes `test/data/datasources.cmake`, which makes test data available to the tests. include (data/datasources.cmake) -add_app_test (api_fastq_coversion_test.cpp) -add_app_test (cli_fastq_coversion_test.cpp) +add_app_test (fpgalign_test.cpp) message (STATUS "You can run `make check` to build and run tests.") diff --git a/test/api_fastq_coversion_test.cpp b/test/api_fastq_coversion_test.cpp deleted file mode 100644 index 863e146..0000000 --- a/test/api_fastq_coversion_test.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#include - -#include - -#include "app_test.hpp" - -// To prevent issues when running multiple API tests in parallel, give each API test unique names: -struct fastq_to_fasta : public app_test -{}; - -TEST_F(fastq_to_fasta, out_empty) -{ - std::string_view const expected{">seq1\nACGTTTGATTCGCG\n>seq2\nTCGGGGGATTCGCG\n"}; - - testing::internal::CaptureStdout(); - configuration const config{.fastq_input = data("in.fastq")}; - convert_fastq(config); - std::string const std_cout = testing::internal::GetCapturedStdout(); - - EXPECT_EQ(expected, std_cout); -} - -TEST_F(fastq_to_fasta, out_not_empty) -{ - std::filesystem::path const test_output = "out.fasta"; - configuration const config{.fastq_input = data("in.fastq"), .fasta_output = test_output}; - convert_fastq(config); // create out.fasta - - ASSERT_TRUE(std::filesystem::exists(test_output)); // check whether out.fasta exists - - // Check whether out.fasta is correct - std::string const expected = string_from_file(data("out.fasta")); - std::string const actual = string_from_file(test_output); - EXPECT_EQ(actual, expected); -} diff --git a/test/app_test.hpp b/test/app_test.hpp index 81b9a34..d70ac50 100644 --- a/test/app_test.hpp +++ b/test/app_test.hpp @@ -1,6 +1,6 @@ // SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin // SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 +// SPDX-License-Identifier: BSD-3-Clause #pragma once diff --git a/test/cli_fastq_coversion_test.cpp b/test/cli_fastq_coversion_test.cpp deleted file mode 100644 index 5b65e34..0000000 --- a/test/cli_fastq_coversion_test.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin -// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -// SPDX-License-Identifier: CC0-1.0 - -#include "app_test.hpp" - -// To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: -struct fastq_to_fasta : public app_test -{}; - -TEST_F(fastq_to_fasta, no_options) -{ - app_test_result const result = execute_app("FPGAlign"); - std::string_view const expected{"Fastq-to-Fasta-Converter\n" - "========================\n" - " Try -h or --help for more information.\n"}; - - EXPECT_SUCCESS(result); - EXPECT_EQ(result.out, expected); - EXPECT_EQ(result.err, ""); -} - -TEST_F(fastq_to_fasta, fail_no_argument) -{ - app_test_result const result = execute_app("FPGAlign", "-v"); - std::string_view const expected{"Parsing error. Not enough positional arguments provided (Need at least 1). " - "See -h/--help for more information.\n"}; - - EXPECT_FAILURE(result); - EXPECT_EQ(result.out, ""); - EXPECT_EQ(result.err, expected); -} - -TEST_F(fastq_to_fasta, with_argument) -{ - app_test_result const result = execute_app("FPGAlign", data("in.fastq")); - - EXPECT_SUCCESS(result); - EXPECT_EQ(result.out, ">seq1\nACGTTTGATTCGCG\n>seq2\nTCGGGGGATTCGCG\n"); - EXPECT_EQ(result.err, ""); -} - -TEST_F(fastq_to_fasta, with_argument_verbose) -{ - app_test_result const result = execute_app("FPGAlign", data("in.fastq"), "-v"); - - EXPECT_SUCCESS(result); - EXPECT_EQ(result.out, ">seq1\nACGTTTGATTCGCG\n>seq2\nTCGGGGGATTCGCG\n"); - EXPECT_EQ(result.err, "Conversion was a success. Congrats!\n"); -} - -TEST_F(fastq_to_fasta, with_out_file) -{ - app_test_result const result = execute_app("FPGAlign", data("in.fastq"), "-o", "out.fasta"); - std::string const expected = string_from_file(data("out.fasta")); - ASSERT_TRUE(std::filesystem::exists("out.fasta")); // check whether out.fasta exists - std::string const actual = string_from_file("out.fasta"); - - EXPECT_SUCCESS(result); - EXPECT_EQ(result.out, ""); - EXPECT_EQ(result.err, ""); - EXPECT_EQ(actual, expected); -} - -TEST_F(fastq_to_fasta, missing_path) -{ - app_test_result const result = execute_app("FPGAlign", data("in.fastq"), "-o", ""); - - EXPECT_FAILURE(result); - EXPECT_EQ(result.out, ""); - EXPECT_EQ(result.err, "Parsing error. Missing value for option -o\n"); -} - -TEST_F(fastq_to_fasta, invalid_path) -{ - app_test_result const result = execute_app("FPGAlign", data("in.fastq"), "-o", "does_not_exist/out.fasta"); - - EXPECT_FAILURE(result); - EXPECT_EQ(result.out, ""); - EXPECT_EQ(result.err, - "Parsing error. Validation failed for option -o/--output: Cannot write \"does_not_exist/out.fasta\"!\n"); -} diff --git a/test/data/REUSE.toml b/test/data/REUSE.toml index c9fd8dd..2512dff 100644 --- a/test/data/REUSE.toml +++ b/test/data/REUSE.toml @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause version = 1 diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index 1e77c5e..c36c172 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin # SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik -# SPDX-License-Identifier: CC0-1.0 +# SPDX-License-Identifier: BSD-3-Clause # This includes `cmake/test/declare_datasource.cmake`, which provides the `declare_datasource` function. include (test/declare_datasource) @@ -27,7 +27,7 @@ include (test/add_local_data) # If you have the file locally, you can use `sha256sum` directly: # sha256sum downloaded.fasta # c3cb990ca1a25c7e31be3c6c2d009238d9ac9a44b2b7c143753c1e2881699077 downloaded.fasta -declare_datasource (FILE downloaded.fasta # This is a custom name. It does not have to match the file name. - URL https://ftp.seqan.de/app-template/downloaded.fasta - URL_HASH SHA256=c3cb990ca1a25c7e31be3c6c2d009238d9ac9a44b2b7c143753c1e2881699077 -) +# declare_datasource (FILE downloaded.fasta # This is a custom name. It does not have to match the file name. +# URL https://ftp.seqan.de/app-template/downloaded.fasta +# URL_HASH SHA256=c3cb990ca1a25c7e31be3c6c2d009238d9ac9a44b2b7c143753c1e2881699077 +# ) diff --git a/test/data/list.txt b/test/data/list.txt new file mode 100644 index 0000000..13425b0 --- /dev/null +++ b/test/data/list.txt @@ -0,0 +1,5 @@ +@data_dir@/in.fastq +@data_dir@/out.fasta +@data_dir@/in.fastq +@data_dir@/out.fasta +@data_dir@/in.fastq diff --git a/test/fpgalign_test.cpp b/test/fpgalign_test.cpp new file mode 100644 index 0000000..9be91ad --- /dev/null +++ b/test/fpgalign_test.cpp @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include "app_test.hpp" + +// To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: +struct fpgalign : public app_test +{}; + +TEST_F(fpgalign, no_options) +{}