|
| 1 | +// |
| 2 | +// RAMNTupleRecord.h |
| 3 | +// Header for RAM (ROOT Alignment/Map) format |
| 4 | + |
| 5 | +#ifndef RAMNTupleRecord_h |
| 6 | +#define RAMNTupleRecord_h |
| 7 | + |
| 8 | +#include <ROOT/RNTuple.hxx> |
| 9 | +#include <ROOT/RNTupleModel.hxx> |
| 10 | +#include <ROOT/RNTupleWriter.hxx> |
| 11 | +#include <ROOT/RNTupleReader.hxx> |
| 12 | +#include <ROOT/RField.hxx> |
| 13 | +#include <ROOT/RNTupleView.hxx> |
| 14 | + |
| 15 | +#include <string> |
| 16 | +#include <vector> |
| 17 | +#include <map> |
| 18 | +#include <memory> |
| 19 | +#include <cstdint> |
| 20 | + |
| 21 | +namespace ROOT::Experimental { |
| 22 | + class RNTupleModel; |
| 23 | + class RNTupleWriter; |
| 24 | + class RNTupleReader; |
| 25 | +} |
| 26 | + |
| 27 | +class RAMNTupleRefs; |
| 28 | +class RAMNTupleIndex; |
| 29 | + |
| 30 | +// RAMNTupleRefs: Manages reference name to ID mappings |
| 31 | +class RAMNTupleRefs { |
| 32 | +private: |
| 33 | + std::vector<std::string> fRefVec; |
| 34 | + mutable int fLastId; |
| 35 | + mutable std::string fLastName; |
| 36 | + |
| 37 | +public: |
| 38 | + RAMNTupleRefs(); |
| 39 | + ~RAMNTupleRefs() = default; |
| 40 | + |
| 41 | + int GetRefId(const std::string& rname); |
| 42 | + const std::string& GetRefName(int rid) const; |
| 43 | + |
| 44 | + void Print() const; |
| 45 | + size_t Size() const { return fRefVec.size(); } |
| 46 | + |
| 47 | + // For RNTuple serialization |
| 48 | + void Clear() { fRefVec.clear(); fLastId = -1; fLastName.clear(); } |
| 49 | + void AddRef(const std::string& ref) { fRefVec.push_back(ref); } |
| 50 | + const std::vector<std::string>& GetRefs() const { return fRefVec; } |
| 51 | + void SetRefs(const std::vector<std::string>& refs) { |
| 52 | + fRefVec = refs; |
| 53 | + fLastId = -1; |
| 54 | + fLastName.clear(); |
| 55 | + } |
| 56 | +}; |
| 57 | + |
| 58 | +// RAMNTupleIndex: Genomic position index for fast region queries |
| 59 | +class RAMNTupleIndex { |
| 60 | +public: |
| 61 | + struct IndexEntry { |
| 62 | + int32_t refid; |
| 63 | + int32_t pos; |
| 64 | + int64_t entry; |
| 65 | + }; |
| 66 | + |
| 67 | +private: |
| 68 | + std::vector<IndexEntry> fIndex; |
| 69 | + mutable std::map<std::pair<int32_t, int32_t>, int64_t> fIndexMap; |
| 70 | + |
| 71 | + void RebuildMap() const; |
| 72 | + |
| 73 | +public: |
| 74 | + RAMNTupleIndex() = default; |
| 75 | + ~RAMNTupleIndex() = default; |
| 76 | + |
| 77 | + void AddItem(int32_t refid, int32_t pos, int64_t row); |
| 78 | + int64_t GetRow(int32_t refid, int32_t pos) const; |
| 79 | + std::vector<int64_t> GetRowsInRange(int32_t refid, int32_t start, int32_t end) const; |
| 80 | + |
| 81 | + void Print() const; |
| 82 | + size_t Size() const { return fIndex.size(); } |
| 83 | + |
| 84 | + // For RNTuple serialization |
| 85 | + const std::vector<IndexEntry>& GetEntries() const { return fIndex; } |
| 86 | + void SetEntries(const std::vector<IndexEntry>& entries) { |
| 87 | + fIndex = entries; |
| 88 | + fIndexMap.clear(); |
| 89 | + } |
| 90 | + void Clear() { fIndex.clear(); fIndexMap.clear(); } |
| 91 | +}; |
| 92 | +// RAMNTupleRecord: Main record structure for genomic alignments |
| 93 | +class RAMNTupleRecord { |
| 94 | +public: |
| 95 | + // Quality compression options |
| 96 | + enum EQualCompressionBits { |
| 97 | + kPhred33 = 1 << 14, // Default Phred+33 quality score |
| 98 | + kIlluminaBinning = 1 << 15, // Illumina 8 bin compression |
| 99 | + kDrop = 1 << 16 // Drop quality score |
| 100 | + }; |
| 101 | + |
| 102 | + // Alignment data fields |
| 103 | + std::string qname; // Query template NAME |
| 104 | + uint16_t flag; // Bitwise FLAG |
| 105 | + int32_t refid; // Reference sequence ID |
| 106 | + int32_t pos; // 0-based left most mapping POSition |
| 107 | + uint8_t mapq; // MAPing Quality |
| 108 | + std::vector<uint32_t> cigar; // CIGAR operations |
| 109 | + int32_t refnext; // Reference ID of the mate/next read |
| 110 | + int32_t pnext; // 0-based position of the mate/next read |
| 111 | + int32_t tlen; // Observed Template LENgth |
| 112 | + std::string seq; // Segment sequence (encoded) |
| 113 | + std::string qual; // Quality scores (encoded) |
| 114 | + std::vector<std::string> tags; // Optional SAM tags |
| 115 | + |
| 116 | + uint32_t compression_flags; |
| 117 | + |
| 118 | + // Static reference and index managers |
| 119 | + static std::unique_ptr<RAMNTupleRefs> fgRnameRefs; |
| 120 | + static std::unique_ptr<RAMNTupleRefs> fgRnextRefs; |
| 121 | + static std::unique_ptr<RAMNTupleIndex> fgIndex; |
| 122 | + |
| 123 | +public: |
| 124 | + RAMNTupleRecord(); |
| 125 | + ~RAMNTupleRecord() = default; |
| 126 | + |
| 127 | + // Setters (SAM format, 1-based positions) |
| 128 | + void SetQNAME(const std::string& qname_) { qname = qname_; } |
| 129 | + void SetFLAG(uint16_t f) { flag = f; } |
| 130 | + void SetRNAME(const std::string& rname); |
| 131 | + void SetREFID(const std::string& rname) { SetRNAME(rname); } |
| 132 | + void SetPOS(int32_t pos_) { pos = pos_ - 1; } // SAM is 1-based, we are 0-based |
| 133 | + void SetMAPQ(uint8_t mapq_) { mapq = mapq_; } |
| 134 | + void SetCIGAR(const std::string& cigar_str); |
| 135 | + void SetRNEXT(const std::string& rnext); |
| 136 | + void SetREFNEXT(const std::string& rnext) { SetRNEXT(rnext); } |
| 137 | + void SetPNEXT(int32_t pnext_) { pnext = pnext_ - 1; } // SAM is 1-based, we are 0-based |
| 138 | + void SetTLEN(int32_t tlen_) { tlen = tlen_; } |
| 139 | + void SetSEQ(const std::string& seq_str); |
| 140 | + void SetQUAL(const std::string& qual_str); |
| 141 | + void AddTag(const std::string& tag) { tags.push_back(tag); } |
| 142 | + void SetOPT(const std::string& tag) { AddTag(tag); } |
| 143 | + void ClearTags() { tags.clear(); } |
| 144 | + void ResetNOPT() { ClearTags(); } |
| 145 | + |
| 146 | + // Getters (SAM format, 1-based positions) |
| 147 | + const std::string& GetQNAME() const { return qname; } |
| 148 | + uint16_t GetFLAG() const { return flag; } |
| 149 | + std::string GetRNAME() const; |
| 150 | + int32_t GetREFID() const { return refid; } |
| 151 | + int32_t GetPOS() const { return pos + 1; } // Convert back to 1-based for SAM |
| 152 | + uint8_t GetMAPQ() const { return mapq; } |
| 153 | + std::string GetCIGAR() const; |
| 154 | + std::string GetRNEXT() const; |
| 155 | + int32_t GetREFNEXT() const { return refnext; } |
| 156 | + int32_t GetPNEXT() const { return pnext + 1; } // Convert back to 1-based for SAM |
| 157 | + int32_t GetTLEN() const { return tlen; } |
| 158 | + std::string GetSEQ() const; |
| 159 | + std::string GetQUAL() const; |
| 160 | + const std::vector<std::string>& GetTags() const { return tags; } |
| 161 | + int GetNOPT() const { return static_cast<int>(tags.size()); } |
| 162 | + const std::string& GetOPT(int idx) const { return tags[idx]; } |
| 163 | + |
| 164 | + // Return sequence length without decoding (fast) |
| 165 | + int GetSEQLEN() const { |
| 166 | + if (seq.size() < 4) return 0; |
| 167 | + return *reinterpret_cast<const uint32_t*>(seq.data()); |
| 168 | + } |
| 169 | + |
| 170 | + size_t GetNCIGAROP() const { return cigar.size(); } |
| 171 | + int32_t GetCIGAROPLEN(size_t idx) const; |
| 172 | + int32_t GetCIGAROP(size_t idx) const; |
| 173 | + |
| 174 | + void Print(const char* option = "") const; |
| 175 | + bool IsValid() const; |
| 176 | + void SetBit(uint32_t bit) { compression_flags |= bit; } |
| 177 | + bool TestBit(uint32_t bit) const { return compression_flags & bit; } |
| 178 | + |
| 179 | + // Static managers |
| 180 | + static void InitializeRefs(); |
| 181 | + static RAMNTupleRefs* GetRnameRefs() { return fgRnameRefs.get(); } |
| 182 | + static RAMNTupleRefs* GetRnextRefs() { return fgRnextRefs.get(); } |
| 183 | + static RAMNTupleIndex* GetIndex() { return fgIndex.get(); } |
| 184 | + |
| 185 | + // File I/O |
| 186 | + static std::unique_ptr<ROOT::Experimental::RNTupleReader> OpenRAMFile( |
| 187 | + const std::string& filename, const std::string& ntupleName = "RAM"); |
| 188 | + static void WriteAllRefs(TFile& file); |
| 189 | + static void ReadAllRefs(const std::string& filename = ""); |
| 190 | + static void WriteIndex(TFile& file); |
| 191 | + static void ReadIndex(const std::string& filename = ""); |
| 192 | + |
| 193 | + // RNTuple model creation |
| 194 | + static std::unique_ptr<ROOT::Experimental::RNTupleModel> MakeModel(); |
| 195 | + |
| 196 | + void SetCompressionMode(uint32_t flags) { compression_flags = flags; } |
| 197 | + |
| 198 | +private: |
| 199 | + static void WriteRefs(TFile& file, const RAMNTupleRefs* refs, const std::string& refname); |
| 200 | + static void ReadRefs(const std::string& filename, std::unique_ptr<RAMNTupleRefs>& refs, const std::string& refname); |
| 201 | + static void WriteRnameRefs(TFile& file) { WriteRefs(file, fgRnameRefs.get(), "RnameRefs"); } |
| 202 | + static void WriteRnextRefs(TFile& file) { WriteRefs(file, fgRnextRefs.get(), "RnextRefs"); } |
| 203 | + static void ReadRnameRefs(const std::string& filename = "") { ReadRefs(filename, fgRnameRefs, "RnameRefs"); } |
| 204 | + static void ReadRnextRefs(const std::string& filename = "") { ReadRefs(filename, fgRnextRefs, "RnextRefs"); } |
| 205 | +}; |
| 206 | +// CIGAR operation codes (from BAM format) |
| 207 | + |
| 208 | +constexpr uint8_t RAM_CIGAR_M = 0; |
| 209 | +constexpr uint8_t RAM_CIGAR_I = 1; |
| 210 | +constexpr uint8_t RAM_CIGAR_D = 2; |
| 211 | +constexpr uint8_t RAM_CIGAR_N = 3; |
| 212 | +constexpr uint8_t RAM_CIGAR_S = 4; |
| 213 | +constexpr uint8_t RAM_CIGAR_H = 5; |
| 214 | +constexpr uint8_t RAM_CIGAR_P = 6; |
| 215 | +constexpr uint8_t RAM_CIGAR_EQUAL = 7; |
| 216 | +constexpr uint8_t RAM_CIGAR_X = 8; |
| 217 | + |
| 218 | +// Sequence and Quality utilities |
| 219 | +namespace RAMNTupleUtils { |
| 220 | + std::string EncodeSequence(const std::string& seq); |
| 221 | + std::string DecodeSequence(const std::string& encoded_seq, size_t length); |
| 222 | + |
| 223 | + std::string EncodeQuality(const std::string& qual, uint32_t compression_flags); |
| 224 | + std::string DecodeQuality(const std::string& encoded_qual, uint32_t compression_flags); |
| 225 | + |
| 226 | + std::vector<uint32_t> ParseCIGAR(const std::string& cigar_str); |
| 227 | + std::string FormatCIGAR(const std::vector<uint32_t>& cigar_ops); |
| 228 | + |
| 229 | + extern const uint8_t kIlluminaBinning[256]; |
| 230 | +} |
| 231 | +// SAM <-> RNTuple conversion utilities |
| 232 | +class RAMNTupleConverter { |
| 233 | +public: |
| 234 | + static void ConvertSAMToRAMNTuple(const std::string& sam_file, |
| 235 | + const std::string& ram_file, |
| 236 | + uint32_t compression_flags = RAMNTupleRecord::kPhred33); |
| 237 | + |
| 238 | + static void ConvertRAMNTupleToSAM(const std::string& ram_file, |
| 239 | + const std::string& sam_file); |
| 240 | + |
| 241 | + static void BuildIndex(const std::string& ram_file); |
| 242 | + |
| 243 | + static void ViewRegion(const std::string& ram_file, |
| 244 | + const std::string& region); |
| 245 | +}; |
| 246 | + |
| 247 | +#endif |
| 248 | + |
0 commit comments