diff --git a/CMakeLists.txt b/CMakeLists.txt index 7469c20..849df58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,22 +92,43 @@ if (UNIX) endif() -#add_subdirectory(external/zlib-cloudflare) include(ExternalProject) -ExternalProject_Add(zlib-cloudflare - GIT_REPOSITORY https://github.com/cloudflare/zlib - GIT_TAG 7aa510344e06fecd6fe09195ac22e9a424ceb660 - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/zlib-cloudflare - SOURCE_DIR ${CMAKE_SOURCE_DIR}/external/zlib-cloudflare +# Define libz-ng as an external project +ExternalProject_Add( + zlib-ng + GIT_REPOSITORY https://github.com/zlib-ng/zlib-ng.git + GIT_TAG 2.3.2 # Use a specific version tag for reproducibility + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX= + -DBUILD_SHARED_LIBS=OFF # Build static library + -DZLIB_COMPAT=ON # Disable compatibility mode to get libz-ng.a name + -DZLIB_ENABLE_TESTS=OFF # Disable tests + BUILD_BYPRODUCTS /lib/${CMAKE_STATIC_LIBRARY_PREFIX}z-ng${CMAKE_STATIC_LIBRARY_SUFFIX} + INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Installing zlib-ng..." + COMMAND ${CMAKE_COMMAND} --build --target install + COMMAND ${CMAKE_COMMAND} -E copy + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}z${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}z-ng${CMAKE_STATIC_LIBRARY_SUFFIX} + ) -ExternalProject_Add_Step(zlib-cloudflare - rename_lib - COMMAND cp ${CMAKE_BINARY_DIR}/zlib-cloudflare/lib/libz.a ${CMAKE_BINARY_DIR}/zlib-cloudflare/lib/libzcf.a - COMMENT "copying cloudflare libz.a to libzcf.a" - DEPENDEES install +# Get the install directory +ExternalProject_Get_Property(zlib-ng INSTALL_DIR) + +# Create the include directory at configure time to avoid CMake errors +file(MAKE_DIRECTORY ${INSTALL_DIR}/include) + +# Create an imported target for libz-ng +add_library(zlibng STATIC IMPORTED) +set_target_properties(zlibng PROPERTIES + IMPORTED_LOCATION ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z-ng${CMAKE_STATIC_LIBRARY_SUFFIX} + INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include ) +# Ensure the external project is built before using the imported target +add_dependencies(zlibng zlib-ng) + ## can't reply on find_package because the ExternalProject_Add # steps won't complete by compile time, when the find_package # looks for the ZLIB @@ -117,11 +138,11 @@ ExternalProject_Add_Step(zlib-cloudflare # install path to the lib / include directories of the # cloudflare libz -include_directories(${CMAKE_BINARY_DIR}/zlib-cloudflare/include) -link_directories(${CMAKE_BINARY_DIR}/zlib-cloudflare/lib) +#include_directories(${CMAKE_BINARY_DIR}/zlib-cloudflare/include) +#link_directories(${CMAKE_BINARY_DIR}/zlib-cloudflare/lib) # direct path to the cloudflare libz -set(CLOUDFLARE_ZLIB ${CMAKE_BINARY_DIR}/zlib-cloudflare/lib/libzcf.a) +#set(CLOUDFLARE_ZLIB ${CMAKE_BINARY_DIR}/zlib-cloudflare/lib/libzcf.a) set(THREADS_PREFER_PTHREAD_FLAG TRUE) @@ -176,7 +197,7 @@ target_include_directories(build_static PRIVATE ${ZLIB_INCLUDE_DIRS} ${CMAKE_CU #target_include_directories(bench PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${LIBRADICL_INCLUDE}) add_executable(check src/check.cpp src/FastxParser.cpp) -target_link_libraries(check ${CLOUDFLARE_ZLIB} sshash_static Threads::Threads) +target_link_libraries(check zlibng sshash_static Threads::Threads) target_include_directories(check PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS} ${LIBRADICL_INCLUDE}) #add_executable(query src/query.cpp ${SSHASH_SOURCES} ${Z_LIB_SOURCES}) @@ -189,15 +210,15 @@ target_include_directories(check PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${Z add_executable(pesc-sc src/pesc_sc_runner.cpp) target_include_directories(pesc-sc PUBLIC ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS} ${LIBRADICL_INCLUDE}) -target_link_libraries(pesc-sc pesc_static ${CLOUDFLARE_ZLIB} Threads::Threads sshash_static ${MALLOC_LIB}) +target_link_libraries(pesc-sc pesc_static zlibng Threads::Threads sshash_static ${MALLOC_LIB}) add_executable(pesc-bulk src/pesc_bulk_runner.cpp) target_include_directories(pesc-bulk PUBLIC ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS} ${LIBRADICL_INCLUDE}) -target_link_libraries(pesc-bulk pesc_static ${CLOUDFLARE_ZLIB} Threads::Threads sshash_static ${MALLOC_LIB}) +target_link_libraries(pesc-bulk pesc_static zlibng Threads::Threads sshash_static ${MALLOC_LIB}) add_executable(pesc-sc-atac src/pesc_sc_atac_runner.cpp) target_include_directories(pesc-sc-atac PUBLIC ${LIBRADICL_INCLUDE} ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS}) -target_link_libraries(pesc-sc-atac pesc_static radicl ${CLOUDFLARE_ZLIB} Threads::Threads sshash_static ${MALLOC_LIB}) +target_link_libraries(pesc-sc-atac pesc_static radicl zlibng Threads::Threads sshash_static ${MALLOC_LIB}) #target_link_directories(pesc-sc-atac PUBLIC /fs/cbcb-lab/rob/students/noor/miniforge3/lib) add_dependencies(pesc-sc-atac radicl) @@ -213,19 +234,19 @@ add_dependencies(pesc-sc-atac radicl) add_executable(build src/build_runner.cpp) target_include_directories(build PUBLIC ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS} ${LIBRADICL_INCLUDE}) -target_link_libraries(build Threads::Threads build_static sshash_static ${CLOUDFLARE_ZLIB} ${MALLOC_LIB}) +target_link_libraries(build Threads::Threads build_static sshash_static zlibng ${MALLOC_LIB}) add_executable(evaluator src/index_evaluator.cpp) target_include_directories(evaluator PUBLIC ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS} ${LIBRADICL_INCLUDE}) -target_link_libraries(evaluator ${CLOUDFLARE_ZLIB} Threads::Threads sshash_static) +target_link_libraries(evaluator zlibng Threads::Threads sshash_static) add_executable(build-poison-table src/poison_table_build_runner.cpp) target_include_directories(build-poison-table PUBLIC ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS}) -target_link_libraries(build-poison-table Threads::Threads build_static pesc_static sshash_static ${CLOUDFLARE_ZLIB} ${MALLOC_LIB}) +target_link_libraries(build-poison-table Threads::Threads build_static pesc_static sshash_static zlibng ${MALLOC_LIB}) add_executable(poison_filter src/poison_read_filter.cpp) target_include_directories(poison_filter PUBLIC ${CMAKE_SOURCE_DIR}/include ${ZLIB_INCLUDE_DIRS}) -target_link_libraries(poison_filter Threads::Threads pesc_static sshash_static ${CLOUDFLARE_ZLIB}) +target_link_libraries(poison_filter Threads::Threads pesc_static sshash_static zlibng) ## tests add_subdirectory(external/doctest) @@ -238,14 +259,14 @@ target_link_libraries(tests PRIVATE doctest pesc_static) ## depend on the cloudflare zlib library -add_dependencies(check zlib-cloudflare) -add_dependencies(evaluator zlib-cloudflare) -add_dependencies(build zlib-cloudflare) -add_dependencies(pesc-sc zlib-cloudflare) -add_dependencies(pesc-sc-atac zlib-cloudflare) -add_dependencies(pesc-bulk zlib-cloudflare) -add_dependencies(build-poison-table zlib-cloudflare) -add_dependencies(poison_filter zlib-cloudflare) +add_dependencies(check zlibng) +add_dependencies(evaluator zlibng) +add_dependencies(build zlibng) +add_dependencies(pesc-sc zlibng) +add_dependencies(pesc-sc-atac zlibng) +add_dependencies(pesc-bulk zlibng) +add_dependencies(build-poison-table zlibng) +add_dependencies(poison_filter zlibng) #add_executable(test_parse src/test_parse_geo.cpp) @@ -263,14 +284,21 @@ install(TARGETS pesc_static build_static sshash_static CONFIGURATIONS Debug RUNTIME DESTINATION Debug/lib) -install(FILES ${CLOUDFLARE_ZLIB} - CONFIGURATIONS Debug - RUNTIME DESTINATION Debug/lib) +install(FILES ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z-ng${CMAKE_STATIC_LIBRARY_SUFFIX} + CONFIGURATIONS Debug + RUNTIME DESTINATION Debug/lib +) install(TARGETS pesc_static build_static sshash_static CONFIGURATIONS Release RUNTIME DESTINATION Release/lib) -install(FILES ${CLOUDFLARE_ZLIB} - CONFIGURATIONS Release - RUNTIME DESTINATION Release/lib) +install(FILES ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z-ng${CMAKE_STATIC_LIBRARY_SUFFIX} + CONFIGURATIONS Release + RUNTIME DESTINATION Release/lib +) + +install(FILES ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z-ng${CMAKE_STATIC_LIBRARY_SUFFIX} + DESTINATION lib + COMPONENT libraries +) diff --git a/external/libradicl b/external/libradicl index dd44f3b..2666a1f 160000 --- a/external/libradicl +++ b/external/libradicl @@ -1 +1 @@ -Subproject commit dd44f3b515bb728301b9a964ea6460ae01e2a720 +Subproject commit 2666a1f9e55c96ebdd65e507627b4384eebdc784 diff --git a/include/rad/util.hpp b/include/rad/util.hpp index a9eb253..b496db0 100644 --- a/include/rad/util.hpp +++ b/include/rad/util.hpp @@ -2,20 +2,21 @@ #define __RAD_UTIL_HPP__ #include +#include +#include -#include "../parallel_hashmap/phmap.h" +#include "../Kmer.hpp" #include "../mapping/utils.hpp" #include "../mapping/utils_bin.hpp" -#include "../Kmer.hpp" +#include "../parallel_hashmap/phmap.h" #include "../reference_index.hpp" #include "rad_header.hpp" #include "rad_writer.hpp" -#include "../../external/libradicl/include/RAD_Writer.hpp" #include "../../external/libradicl/include/Alignment_Record.hpp" -#include "../../external/libradicl/include/Read_Record.hpp" #include "../../external/libradicl/include/Byte_Array.hpp" -#include "../../external/libradicl/include/Tags.hpp" +#include "../../external/libradicl/include/RAD_Writer.hpp" +#include "../../external/libradicl/include/Read_Record.hpp" #include "../../external/libradicl/include/Tags.hpp" namespace rad { @@ -24,8 +25,10 @@ namespace util { using umi_kmer_t = combinelib::kmers::Kmer<31, 2>; using bc_kmer_t = combinelib::kmers::Kmer<31, 3>; -inline size_t write_rad_header(mindex::reference_index &ri, size_t bc_length, - size_t umi_length, std::ofstream &rad_file) { +inline std::pair> +write_rad_header(mindex::reference_index &ri, size_t bc_length, + size_t umi_length, bool with_position, + std::ofstream &rad_file) { rad_writer bw; // RADHeader rad_header rh; @@ -46,7 +49,7 @@ inline size_t write_rad_header(mindex::reference_index &ri, size_t bc_length, // write the tag meta-information section // File-level tag description - uint16_t file_level_tags{2}; + uint16_t file_level_tags = with_position ? 4 : 3; bw << file_level_tags; // cblen @@ -57,6 +60,15 @@ inline size_t write_rad_header(mindex::reference_index &ri, size_t bc_length, bw << std::string("ulen"); bw << type_id; + bw << std::string("known_rad_type"); + bw << static_cast(8); + + if (with_position) { + // rlen + bw << std::string("rlen"); + bw << static_cast(3); // use u32 for read length + } + // read-level tag description uint16_t read_level_tags{2}; bw << read_level_tags; @@ -88,27 +100,44 @@ inline size_t write_rad_header(mindex::reference_index &ri, size_t bc_length, bw << type_id; // alignment-level tag description - uint16_t aln_level_tags{1}; + uint16_t aln_level_tags = with_position ? 2 : 1; bw << aln_level_tags; // we maintain orientation // bw << std::string("orientation"); // type_id = 1; // bw << type_id; - // and reference id + // tag 1: compressed orientation and reference id (u32) bw << std::string("compressed_ori_refid"); type_id = 3; bw << type_id; + if (with_position) { + bw << std::string("pos"); + type_id = 3; + bw << type_id; + } + // ### end of tag definitions // the actual file-level tags bw << static_cast(bc_length); bw << static_cast(umi_length); + std::string rad_type = with_position ? "sc_rna_pos" : "sc_rna_basic"; + bw << rad_type; + + // Save offset where read_length will be written (as placeholder with 0) + std::optional read_length_offset = + with_position ? std::make_optional(bw.num_bytes()) : std::nullopt; + if (with_position) { + // Write 0 as placeholder for read_length - will be updated later + bw << static_cast(0); + } + rad_file << bw; bw.clear(); - return chunk_offset; + return {chunk_offset, read_length_offset}; } inline size_t write_rad_header_bulk(mindex::reference_index &ri, bool is_paired, @@ -132,10 +161,12 @@ inline size_t write_rad_header_bulk(mindex::reference_index &ri, bool is_paired, // write the tag meta-information section // File-level tag description - // none right now - uint16_t file_level_tags{1}; + uint16_t file_level_tags{2}; bw << file_level_tags; + bw << std::string("known_rad_type"); + bw << static_cast(8); + bw << std::string("ref_lengths"); uint8_t type_id{7}; // type is array bw << type_id; @@ -177,6 +208,9 @@ inline size_t write_rad_header_bulk(mindex::reference_index &ri, bool is_paired, // ### end of tag definitions + std::string rad_type = "bulk_with_pos"; + bw << rad_type; + // the actual file-level tag // we've already recorded the description // so here, we give the length and the the @@ -194,22 +228,28 @@ inline size_t write_rad_header_bulk(mindex::reference_index &ri, bool is_paired, return chunk_offset; } -inline void write_rad_header_atac(mindex::reference_index& ri, std::vector& refs, RAD::Tag_Defn& tag_defn) { - - for (size_t i = 0; i < ri.num_refs(); ++i) { refs.emplace_back(ri.ref_name(i)); } +inline void write_rad_header_atac(mindex::reference_index &ri, + std::vector &refs, + RAD::Tag_Defn &tag_defn) { - tag_defn.add_file_tag("cblen"); - tag_defn.add_file_tag("ref_lengths"); + for (size_t i = 0; i < ri.num_refs(); ++i) { + refs.emplace_back(ri.ref_name(i)); + } + + tag_defn.add_file_tag("cblen"); + tag_defn.add_file_tag("known_rad_type"); + tag_defn.add_file_tag("ref_lengths"); - tag_defn.add_read_tag("barcode"); + tag_defn.add_read_tag("barcode"); - tag_defn.add_aln_tag("ref"); - tag_defn.add_aln_tag("type"); - tag_defn.add_aln_tag("start_pos"); - tag_defn.add_aln_tag("frag_len"); + tag_defn.add_aln_tag("ref"); + tag_defn.add_aln_tag("type"); + tag_defn.add_aln_tag("start_pos"); + tag_defn.add_aln_tag("frag_len"); } -inline void write_to_rad_stream(bc_kmer_t &bck, umi_kmer_t &umi, +inline void +write_to_rad_stream(bc_kmer_t &bck, umi_kmer_t &umi, bool with_position, mapping::util::MappingType map_type, std::vector &accepted_hits, phmap::flat_hash_map &unmapped_bc_map, @@ -282,12 +322,20 @@ inline void write_to_rad_stream(bc_kmer_t &bck, umi_kmer_t &umi, // NOTE: should not happen! break; } - bw << (aln.tid | fw_mask); + uint32_t compressed_ori_refid = (aln.tid | fw_mask); + bw << compressed_ori_refid; + + if (with_position) { + // Add pos tag as u32 + uint32_t pos_u32 = static_cast(aln.pos); + bw << pos_u32; + } } ++num_reads_in_chunk; } -inline void write_to_rad_stream_bulk(mapping::util::MappingType map_type, +inline void +write_to_rad_stream_bulk(mapping::util::MappingType map_type, std::vector &accepted_hits, uint32_t &num_reads_in_chunk, rad_writer &bw) { if (map_type == mapping::util::MappingType::UNMAPPED) { @@ -353,115 +401,118 @@ inline void write_to_rad_stream_bulk(mapping::util::MappingType map_type, ++num_reads_in_chunk; } +inline void write_to_rad_stream_atac( + bc_kmer_t &bck, mapping::util::MappingType map_type, + std::vector &accepted_hits, + phmap::flat_hash_map &unmapped_bc_map, + uint32_t &num_reads_in_chunk, std::optional &strbuff, std::string &barcode, + mindex::reference_index &ri, RAD::RAD_Writer &rw, RAD::Token &token, + bool tn5_shift) { + if (map_type == mapping::util::MappingType::UNMAPPED) { + unmapped_bc_map[bck.word(0)] += 1; + // do nothing here + return; + } + RAD::Read read_rec; -inline void write_to_rad_stream_atac(bc_kmer_t& bck, mapping::util::MappingType map_type, - std::vector& accepted_hits, - phmap::flat_hash_map& unmapped_bc_map, - uint32_t& num_reads_in_chunk, std::string& strbuff, - std::string& barcode, mindex::reference_index& ri, - RAD::RAD_Writer& rw, RAD::Token& token, bool tn5_shift) { - - if (map_type == mapping::util::MappingType::UNMAPPED) { - unmapped_bc_map[bck.word(0)] += 1; - // do nothing here - return; - } - RAD::Read read_rec; - - read_rec.set(accepted_hits.size()); - - const uint32_t barcode_len = bc_kmer_t::k(); - if (barcode_len <= 32) { - if (barcode_len <= 16) { // can use 32-bit int - uint32_t shortbck = static_cast(0x00000000FFFFFFFF & bck.word(0)); - read_rec.add_tag(RAD::Type::u32(shortbck)); - } else { // must use 64-bit int - read_rec.add_tag(RAD::Type::u64(bck.word(0))); - } - } else { - std::cerr << "should not happen\n"; + read_rec.set(accepted_hits.size()); + + const uint32_t barcode_len = bc_kmer_t::k(); + if (barcode_len <= 32) { + if (barcode_len <= 16) { // can use 32-bit int + uint32_t shortbck = + static_cast(0x00000000FFFFFFFF & bck.word(0)); + read_rec.add_tag(RAD::Type::u32(shortbck)); + } else { // must use 64-bit int + read_rec.add_tag(RAD::Type::u64(bck.word(0))); } - for (auto& aln : accepted_hits) { - RAD::Aln_Record aln_rec; - uint8_t type{0}; - // top 2 bits are fw,rc ori - // uint32_t fw_mask = aln.is_fw ? 0x80000000 : 0x00000000; - // uint32_t mate_fw_mask = aln.mate_is_fw ? 0x40000000 : 0x00000000; - // bottom 30 bits are target id - // strbuff += std::to_string((0x3FFFFFFF & aln.tid) | fw_mask | mate_fw_mask); - strbuff += ri.ref_name(aln.tid); - strbuff += "\t"; - int32_t leftmost_pos = 0; - // placeholder value for no fragment length - uint16_t frag_len = std::numeric_limits::max(); - - switch (map_type) { - case mapping::util::MappingType::SINGLE_MAPPED: - // then the posittion must be that of the only - // mapped read. - leftmost_pos = std::max(0, aln.pos); - type = 1; - break; - case mapping::util::MappingType::MAPPED_FIRST_ORPHAN: - leftmost_pos = std::max(0, aln.pos); - type = 2; - break; - case mapping::util::MappingType::MAPPED_SECOND_ORPHAN: - // it's not mate pos b/c in this case we - // simply returned the right accepted hits - // as the accepted hits - leftmost_pos = std::max(0, aln.pos); - type = 3; - break; - case mapping::util::MappingType::MAPPED_PAIR: - // if we actually have a paird fragment get the - // leftmost position - leftmost_pos = std::min(aln.pos, aln.mate_pos); - frag_len = aln.frag_len(); - type = 4; - // if the leftmost position is < 0, then adjust - // the overhang by setting the start position to 0 - // and subtracting the overhang from the fragment - // length. - if (leftmost_pos < 0) { - frag_len = aln.frag_len() + leftmost_pos; - leftmost_pos = 0; - } - break; - case mapping::util::MappingType::UNMAPPED: - type = 8; - // don't do anything here - break; - } - if (tn5_shift) { - leftmost_pos += 4; - frag_len -= 9; - } - aln_rec.clear(); - aln_rec.add_tag(RAD::Type::u32(aln.tid)); - aln_rec.add_tag(RAD::Type::u8(type)); - aln_rec.add_tag(RAD::Type::u32(leftmost_pos)); - aln_rec.add_tag(RAD::Type::u16(frag_len)); - read_rec.add_aln_rec(aln_rec); - - strbuff += std::to_string(leftmost_pos); - strbuff += "\t"; - strbuff += std::to_string(leftmost_pos + frag_len); - strbuff += "\t"; - strbuff += barcode; - strbuff += "\t"; - strbuff += std::to_string(accepted_hits.size()); - strbuff += "\n"; + } else { + std::cerr << "should not happen\n"; + } + for (auto &aln : accepted_hits) { + RAD::Aln_Record aln_rec; + uint8_t type{0}; + // top 2 bits are fw,rc ori + // uint32_t fw_mask = aln.is_fw ? 0x80000000 : 0x00000000; + // uint32_t mate_fw_mask = aln.mate_is_fw ? 0x40000000 : 0x00000000; + // bottom 30 bits are target id + // strbuff += std::to_string((0x3FFFFFFF & aln.tid) | fw_mask | + // mate_fw_mask); + if (strbuff) { + *strbuff += ri.ref_name(aln.tid); + *strbuff += "\t"; } - - rw.add(read_rec, token); - ++num_reads_in_chunk; -} + int32_t leftmost_pos = 0; + // placeholder value for no fragment length + uint16_t frag_len = std::numeric_limits::max(); + switch (map_type) { + case mapping::util::MappingType::SINGLE_MAPPED: + // then the posittion must be that of the only + // mapped read. + leftmost_pos = std::max(0, aln.pos); + type = 1; + break; + case mapping::util::MappingType::MAPPED_FIRST_ORPHAN: + leftmost_pos = std::max(0, aln.pos); + type = 2; + break; + case mapping::util::MappingType::MAPPED_SECOND_ORPHAN: + // it's not mate pos b/c in this case we + // simply returned the right accepted hits + // as the accepted hits + leftmost_pos = std::max(0, aln.pos); + type = 3; + break; + case mapping::util::MappingType::MAPPED_PAIR: + // if we actually have a paird fragment get the + // leftmost position + leftmost_pos = std::min(aln.pos, aln.mate_pos); + frag_len = aln.frag_len(); + type = 4; + // if the leftmost position is < 0, then adjust + // the overhang by setting the start position to 0 + // and subtracting the overhang from the fragment + // length. + if (leftmost_pos < 0) { + frag_len = aln.frag_len() + leftmost_pos; + leftmost_pos = 0; + } + break; + case mapping::util::MappingType::UNMAPPED: + type = 8; + // don't do anything here + break; + } + if (tn5_shift) { + leftmost_pos += 4; + frag_len -= 9; + } + aln_rec.clear(); + aln_rec.add_tag(RAD::Type::u32(aln.tid)); + aln_rec.add_tag(RAD::Type::u8(type)); + aln_rec.add_tag(RAD::Type::u32(leftmost_pos)); + aln_rec.add_tag(RAD::Type::u16(frag_len)); + read_rec.add_aln_rec(aln_rec); + + if (strbuff) { + *strbuff += std::to_string(leftmost_pos); + *strbuff += "\t"; + *strbuff += std::to_string(leftmost_pos + frag_len); + *strbuff += "\t"; + *strbuff += barcode; + *strbuff += "\t"; + *strbuff += std::to_string(accepted_hits.size()); + *strbuff += "\n"; + } + } -} // namespace util -} // namespace rad + rw.add(read_rec, token); + ++num_reads_in_chunk; +} +} // namespace util +} // namespace rad -#endif //__RAD_UTIL_HPP__ +#endif //__RAD_UTIL_HPP__ diff --git a/src/pesc_sc.cpp b/src/pesc_sc.cpp index e94e000..9cc07d1 100644 --- a/src/pesc_sc.cpp +++ b/src/pesc_sc.cpp @@ -5,6 +5,7 @@ #include "../include/cli11/CLI11.hpp" #include "../include/defaults.hpp" #include "../include/ghc/filesystem.hpp" +#include "../include/itlib/small_vector.hpp" #include "../include/mapping/utils.hpp" #include "../include/meta_info.hpp" #include "../include/parallel_hashmap/phmap.h" @@ -21,9 +22,12 @@ #include "../include/util_piscem.hpp" #include "zlib.h" +#include +#include #include #include #include +#include #include #include #include @@ -31,6 +35,7 @@ #include #include #include +#include #include using namespace klibpp; @@ -60,6 +65,7 @@ struct pesc_sc_options { std::string library_geometry; protocol_t pt{protocol_t::CUSTOM}; std::unique_ptr p{nullptr}; + bool with_position{false}; bool no_poison{false}; bool quiet{false}; bool enable_structural_constraints{false}; @@ -96,6 +102,11 @@ class pesc_output_info { // the mutex for safely writing to // unmapped_bc_file std::mutex unmapped_bc_mutex; + + // Collect read lengths during processing, validate after all loops + std::mutex read_length_mutex; + std::vector collected_read_lengths; + // std::atomic has_collected_3_read_lengths{false}; }; // single-end @@ -240,21 +251,27 @@ void do_map(mindex::reference_index &ri, (void)num_short_umi; (void)num_ambig_umi; - mapping_cache_info> map_cache_left(ri); + constexpr size_t num_local_samples = 10; + itlib::small_vector local_read_lengths; + + mapping_cache_info> map_cache_left( + ri); map_cache_left.max_ec_card = po.max_ec_card; map_cache_left.max_hit_occ = po.max_hit_occ; map_cache_left.max_hit_occ_recover = po.max_hit_occ_recover; map_cache_left.max_read_occ = po.max_read_occ; map_cache_left.attempt_occ_recover = po.attempt_occ_recover; - mapping_cache_info> map_cache_right(ri); + mapping_cache_info> + map_cache_right(ri); map_cache_right.max_ec_card = po.max_ec_card; map_cache_right.max_hit_occ = po.max_hit_occ; map_cache_right.max_hit_occ_recover = po.max_hit_occ_recover; map_cache_right.max_read_occ = po.max_read_occ; map_cache_right.attempt_occ_recover = po.attempt_occ_recover; - mapping_cache_info> map_cache_out(ri); + mapping_cache_info> map_cache_out( + ri); map_cache_out.max_ec_card = po.max_ec_card; map_cache_out.max_hit_occ = po.max_hit_occ; map_cache_out.max_hit_occ_recover = po.max_hit_occ_recover; @@ -332,6 +349,18 @@ void do_map(mindex::reference_index &ri, AlignableReadSeqs read_seqs = protocol.get_mappable_read_sequences( record.first.seq, record.second.seq); + // Collect read length (only if we haven't collected enough valid lengths + // yet) + if (po.with_position and local_read_lengths.size() < num_local_samples) { + std::string *alignable_seq = read_seqs.get_alignable_seq(); + if (alignable_seq != nullptr) { + uint16_t read_length = static_cast(alignable_seq->length()); + if (read_length > 0) { + local_read_lengths.push_back(read_length); + } + } + } + bool had_early_stop = false; // dispatch on the *compile-time determined* paired-endness of this // protocol. @@ -355,8 +384,9 @@ void do_map(mindex::reference_index &ri, global_nhits += map_cache_out.accepted_hits.empty() ? 0 : 1; rad::util::write_to_rad_stream( - bc_kmer, umi_kmer, map_cache_out.map_type, map_cache_out.accepted_hits, - map_cache_out.unmapped_bc_map, num_reads_in_chunk, rad_w); + bc_kmer, umi_kmer, po.with_position, map_cache_out.map_type, + map_cache_out.accepted_hits, map_cache_out.unmapped_bc_map, + num_reads_in_chunk, rad_w); // dump buffer if (num_reads_in_chunk > max_chunk_reads) { @@ -390,6 +420,14 @@ void do_map(mindex::reference_index &ri, num_reads_in_chunk = 0; } + if (po.with_position) { + out_info.read_length_mutex.lock(); + for (auto rl : local_read_lengths) { + out_info.collected_read_lengths.push_back(rl); + } + out_info.read_length_mutex.unlock(); + } + // unmapped barcode writer { // make a scope and dump the unmapped barcode counts rad_writer ubcw; @@ -547,6 +585,9 @@ int run_pesc_sc(int argc, char **argv) { .add_option("-t,--threads", po.nthread, "An integer that specifies the number of threads to use") ->default_val(16); + app.add_flag("--with-position", po.with_position, + "Include information about the position information of each " + "mapped read in the output RAD file"); app.add_flag( "--no-poison", po.no_poison, "Do not filter reads for poison k-mers, even if a poison table " @@ -693,13 +734,13 @@ int run_pesc_sc(int argc, char **argv) { size_t bc_length = bc_kmer_t::k(); size_t umi_length = umi_kmer_t::k(); - size_t chunk_offset = - rad::util::write_rad_header(ri, bc_length, umi_length, out_info.rad_file); + auto [chunk_offset, read_length_offset] = rad::util::write_rad_header( + ri, bc_length, umi_length, po.with_position, out_info.rad_file); std::mutex iomut; uint32_t np = 1; - + auto num_input_files = po.left_read_filenames.size(); size_t additional_files = (num_input_files > 1) ? (num_input_files - 1) : 0; @@ -822,6 +863,54 @@ int run_pesc_sc(int argc, char **argv) { uint64_t nc = out_info.num_chunks.load(); out_info.rad_file.write(reinterpret_cast(&nc), sizeof(nc)); + if (po.with_position) { + uint32_t final_read_length = 0; + std::unordered_map freq_map; + size_t tot = 0; + for (auto &l : out_info.collected_read_lengths) { + freq_map[l] += 1; + tot++; + } + + uint32_t most_frequent_len = 0; + size_t frequency = 0; + for (auto &kv : freq_map) { + if (kv.second > frequency) { + frequency = kv.second; + most_frequent_len = kv.first; + } + } + + double most_frequent_ratio = + (tot > 0) ? frequency / static_cast(tot) : 0.0; + + // Validate and update read_length in header (after all loops, before + // writing) + if (most_frequent_ratio >= 0.9) { + final_read_length = most_frequent_len; + spdlog_piscem::info("Found validated read length: {}", + final_read_length); + } else { + std::stringstream sstr; + for (auto &v : out_info.collected_read_lengths) { + sstr << v << ", "; + } + std::string vec_str = sstr.str(); + if (!vec_str.empty()) { + vec_str.pop_back(); + } + if (!vec_str.empty()) { + vec_str.pop_back(); + } + spdlog_piscem::warn("Collected read lengths are too variable: [{}]", + vec_str); + } + if (final_read_length > 0) { + out_info.rad_file.seekp(*read_length_offset); + out_info.rad_file.write(reinterpret_cast(&final_read_length), + sizeof(final_read_length)); + } + } out_info.rad_file.close(); // We want to check if the RAD file stream was written to diff --git a/src/pesc_sc_atac.cpp b/src/pesc_sc_atac.cpp index 5fd2249..c8e8803 100644 --- a/src/pesc_sc_atac.cpp +++ b/src/pesc_sc_atac.cpp @@ -3,6 +3,7 @@ #include "../include/CanonicalKmerIterator.hpp" #include "../include/FastxParser.hpp" #include "../include/Kmer.hpp" +#include "../include/boost/unordered/concurrent_flat_map.hpp" #include "../include/cli11/CLI11.hpp" #include "../include/ghc/filesystem.hpp" #include "../include/mapping/utils.hpp" @@ -19,7 +20,6 @@ #include "../include/spdlog_piscem/spdlog.h" #include "../include/streaming_query.hpp" #include "../include/util_piscem.hpp" -#include "../include/boost/unordered/concurrent_flat_map.hpp" #include "check_overlap.cpp" // #include "FastxParser.cpp" // #include "hit_searcher.cpp" @@ -621,9 +621,8 @@ void do_map(mindex::reference_index &ri, std::atomic &r_orphan, std::atomic &l_orphan, std::atomic &global_npoisoned, pesc_output_info &out_info, std::mutex &iomut, bool write_bed, bool check_kmers_orphans, - bool tn5_shift, bool use_chr, - piscem::unitig_end_cache_t& unitig_end_cache, - RAD::RAD_Writer &rw, + bool tn5_shift, bool use_chr, + piscem::unitig_end_cache_t &unitig_end_cache, RAD::RAD_Writer &rw, RAD::Token token) { auto log_level = spdlog_piscem::get_level(); @@ -674,9 +673,12 @@ void do_map(mindex::reference_index &ri, std::string workstr_right; std::ostringstream osstream; - mapping_cache_info> map_cache_left(ri, &unitig_end_cache); - mapping_cache_info> map_cache_right(ri, &unitig_end_cache); - mapping_cache_info> map_cache_out(ri, &unitig_end_cache); + mapping_cache_info> map_cache_left( + ri, &unitig_end_cache); + mapping_cache_info> map_cache_right( + ri, &unitig_end_cache); + mapping_cache_info> map_cache_out( + ri, &unitig_end_cache); size_t max_chunk_reads = 5000; @@ -692,7 +694,7 @@ void do_map(mindex::reference_index &ri, uint64_t read_num = 0; (void)read_num; - std::string temp_buff = ""; + std::optional temp_buff = write_bed ? std::make_optional("") : std::nullopt; while (parser.refill(rg)) { for (auto &record : rg) { ++global_nr; @@ -780,11 +782,11 @@ void do_map(mindex::reference_index &ri, if (num_reads_in_chunk > max_chunk_reads) { if (write_bed) { out_info.bed_mutex.lock(); - out_info.bed_file << temp_buff; + out_info.bed_file << *temp_buff; out_info.bed_mutex.unlock(); + *temp_buff = ""; } out_info.num_chunks++; - temp_buff = ""; num_reads_in_chunk = 0; } } @@ -793,11 +795,11 @@ void do_map(mindex::reference_index &ri, if (num_reads_in_chunk > 0) { if (write_bed) { out_info.bed_mutex.lock(); - out_info.bed_file << temp_buff; + out_info.bed_file << *temp_buff; out_info.bed_mutex.unlock(); + *temp_buff = ""; } out_info.num_chunks++; - temp_buff = ""; num_reads_in_chunk = 0; } @@ -917,7 +919,10 @@ int run_pesc_sc_atac(int argc, char **argv) { ->default_val("permissive"); app.add_flag("--quiet", po.quiet, "Try to be quiet in terms of console output"); - app.add_option("--end-cache-capacity", po.end_cache_capacity, "maximum capcity of the unitig end cache")->default_val(5000000); + app + .add_option("--end-cache-capacity", po.end_cache_capacity, + "maximum capcity of the unitig end cache") + ->default_val(5000000); app.add_option("--thr", po.thr, "threshold for psa")->default_val(0.7); app.add_option("--bclen", po.blen, "length for barcode")->default_val(16); app.add_option("--bin-size", po.bin_size, "size for binning") @@ -1020,6 +1025,7 @@ int run_pesc_sc_atac(int argc, char **argv) { RAD::Tag_Defn tag_defn; RAD::Tag_List file_tag_vals; file_tag_vals.add(RAD::Type::u16(po.blen)); + file_tag_vals.add(RAD::Type::str("sc_atac")); std::vector len; len.reserve(ri.num_refs()); for (decltype(ri.num_refs()) i = 0; i < ri.num_refs(); i++) { @@ -1072,7 +1078,7 @@ int run_pesc_sc_atac(int argc, char **argv) { if (paired_end) { using FragmentT = fastx_parser::ReadTrip; - + auto num_input_files = po.left_read_filenames.size(); size_t additional_files = (num_input_files > 1) ? (num_input_files - 1) : 0; @@ -1098,46 +1104,49 @@ int run_pesc_sc_atac(int argc, char **argv) { piscem::unitig_end_cache_t unitig_end_cache(po.end_cache_capacity); std::vector workers; for (size_t i = 0; i < nthread; ++i) { - workers.push_back(std::thread([&ri, &po, &rparser, &binning, &ptab, - &global_nr, &global_nh, &global_nmult, - &k_match, &global_np, &out_info, &iomut, - &rw, &l_match, &r_match, &dove_match, - &dove_num, &ov_num, &ov_match, &r_orphan, - &l_orphan, &unitig_end_cache]() { - const auto token = rw.get_token(); - if (!po.enable_structural_constraints) { - using SketchHitT = - mapping::util::sketch_hit_info_no_struct_constraint; - if (po.use_sam_format) { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); - } else { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); - } - } else { - using SketchHitT = mapping::util::sketch_hit_info; - if (po.use_sam_format) { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); + workers.push_back(std::thread( + [&ri, &po, &rparser, &binning, &ptab, &global_nr, &global_nh, + &global_nmult, &k_match, &global_np, &out_info, &iomut, &rw, &l_match, + &r_match, &dove_match, &dove_num, &ov_num, &ov_match, &r_orphan, + &l_orphan, &unitig_end_cache]() { + const auto token = rw.get_token(); + if (!po.enable_structural_constraints) { + using SketchHitT = + mapping::util::sketch_hit_info_no_struct_constraint; + if (po.use_sam_format) { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } else { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } } else { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); + using SketchHitT = mapping::util::sketch_hit_info; + if (po.use_sam_format) { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } else { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } } - } - })); + })); } for (auto &w : workers) { @@ -1146,7 +1155,7 @@ int run_pesc_sc_atac(int argc, char **argv) { rparser.stop(); } else { using FragmentT = fastx_parser::ReadPair; - + auto num_input_files = po.single_read_filenames.size(); size_t additional_files = (num_input_files > 1) ? (num_input_files - 1) : 0; @@ -1171,46 +1180,49 @@ int run_pesc_sc_atac(int argc, char **argv) { piscem::unitig_end_cache_t unitig_end_cache(po.end_cache_capacity); std::vector workers; for (size_t i = 0; i < nthread; ++i) { - workers.push_back(std::thread([&ri, &po, &rparser, &binning, &ptab, - &global_nr, &global_nh, &global_nmult, - &k_match, &global_np, &out_info, &iomut, - &rw, &l_match, &r_match, &dove_match, - &dove_num, &ov_num, &ov_match, &r_orphan, - &l_orphan, &unitig_end_cache]() { - const auto token = rw.get_token(); - if (!po.enable_structural_constraints) { - using SketchHitT = - mapping::util::sketch_hit_info_no_struct_constraint; - if (po.use_sam_format) { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); - } else { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); - } - } else { - using SketchHitT = mapping::util::sketch_hit_info; - if (po.use_sam_format) { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); + workers.push_back(std::thread( + [&ri, &po, &rparser, &binning, &ptab, &global_nr, &global_nh, + &global_nmult, &k_match, &global_np, &out_info, &iomut, &rw, &l_match, + &r_match, &dove_match, &dove_num, &ov_num, &ov_match, &r_orphan, + &l_orphan, &unitig_end_cache]() { + const auto token = rw.get_token(); + if (!po.enable_structural_constraints) { + using SketchHitT = + mapping::util::sketch_hit_info_no_struct_constraint; + if (po.use_sam_format) { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } else { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } } else { - do_map( - ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, - k_match, l_match, r_match, dove_num, dove_match, ov_num, ov_match, - r_orphan, l_orphan, global_np, out_info, iomut, po.use_bed_format, - po.check_kmers_orphans, po.tn5_shift, po.use_chr, unitig_end_cache, rw, token); + using SketchHitT = mapping::util::sketch_hit_info; + if (po.use_sam_format) { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } else { + do_map( + ri, rparser, binning, ptab, global_nr, global_nh, global_nmult, + k_match, l_match, r_match, dove_num, dove_match, ov_num, + ov_match, r_orphan, l_orphan, global_np, out_info, iomut, + po.use_bed_format, po.check_kmers_orphans, po.tn5_shift, + po.use_chr, unitig_end_cache, rw, token); + } } - } - })); + })); } for (auto &w : workers) {