Skip to content

Commit 8572608

Browse files
Add RNTuple support for RAM genomic format (#1)
1 parent 8fb4f6c commit 8572608

11 files changed

+1442
-5
lines changed

CMakeLists.txt

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
cmake_minimum_required(VERSION 3.16)
2+
3+
project(ramtools LANGUAGES CXX)
4+
5+
set(CMAKE_CXX_STANDARD 17)
6+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
7+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
8+
9+
find_package(ROOT 6.26 REQUIRED
10+
COMPONENTS Core RIO Tree ROOTNTuple)
11+
12+
set(RNTUPLE_LIBS ROOT::ROOTNTuple ROOT::ROOTNTupleUtil)
13+
14+
include(${ROOT_USE_FILE})
15+
16+
include_directories(${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/rntuple)
17+
18+
ROOT_STANDARD_LIBRARY_PACKAGE(ramrecord
19+
SOURCES ramrecord.C
20+
HEADERS ramrecord.h
21+
LIBRARIES ROOT::Core ROOT::RIO ROOT::Tree
22+
)
23+
24+
set(RAMTOOLS_ROOT_LIBS ROOT::Core ROOT::RIO ROOT::Tree)
25+
26+
ROOT_STANDARD_LIBRARY_PACKAGE(ramntuplerecord
27+
SOURCES rntuple/RAMntuplerecord.C
28+
HEADERS rntuple/RAMNTupleRecord.h
29+
LIBRARIES ROOT::Core ROOT::RIO ROOT::Tree ${RNTUPLE_LIBS}
30+
)
31+
32+
ROOT_EXECUTABLE(samtoramntuple
33+
rntuple/samtoramntuple.C
34+
LIBRARIES ramntuplerecord ${RAMTOOLS_ROOT_LIBS} ${RNTUPLE_LIBS})
35+
36+
include(CTest)
37+
file(GLOB RAMTESTS tests/test_*.C)
38+
foreach(testfile ${RAMTESTS})
39+
get_filename_component(testname ${testfile} NAME_WE)
40+
add_test(NAME ${testname}
41+
COMMAND root -l -b -q ${testfile}+)
42+
set_tests_properties(${testname} PROPERTIES WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
43+
endforeach()
44+

LinkDef.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#ifdef __CLING__
2+
#pragma link off all globals;
3+
#pragma link off all classes;
4+
#pragma link off all functions;
5+
6+
#pragma link C++ class RAMRefs+;
7+
#pragma link C++ class RAMIndex+;
8+
#pragma link C++ class RAMRecord+;
9+
#endif

Makefile

Lines changed: 0 additions & 4 deletions
This file was deleted.

ramrecord.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,14 @@
1111
#include <TObject.h>
1212
#include <TString.h>
1313
#include <TError.h>
14+
#include <TFile.h>
15+
#include <TTree.h>
1416
#include <iostream>
1517

18+
#include <vector>
19+
#include <map>
20+
#include <utility>
21+
1622

1723
class RAMRefs {
1824
private:

rntuple/RAMNTupleRecord.h

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
//
2+
// RAMNTupleRecord.h
3+
// Header for RAM (ROOT Alignment/Map) format
4+
5+
#ifndef RAMNTupleRecord_h
6+
#define RAMNTupleRecord_h
7+
8+
#include <ROOT/RNTuple.hxx>
9+
#include <ROOT/RNTupleModel.hxx>
10+
#include <ROOT/RNTupleWriter.hxx>
11+
#include <ROOT/RNTupleReader.hxx>
12+
#include <ROOT/RField.hxx>
13+
#include <ROOT/RNTupleView.hxx>
14+
15+
#include <string>
16+
#include <vector>
17+
#include <map>
18+
#include <memory>
19+
#include <cstdint>
20+
21+
namespace ROOT::Experimental {
22+
class RNTupleModel;
23+
class RNTupleWriter;
24+
class RNTupleReader;
25+
}
26+
27+
class RAMNTupleRefs;
28+
class RAMNTupleIndex;
29+
30+
// RAMNTupleRefs: Manages reference name to ID mappings
31+
class RAMNTupleRefs {
32+
private:
33+
std::vector<std::string> fRefVec;
34+
mutable int fLastId;
35+
mutable std::string fLastName;
36+
37+
public:
38+
RAMNTupleRefs();
39+
~RAMNTupleRefs() = default;
40+
41+
int GetRefId(const std::string& rname);
42+
const std::string& GetRefName(int rid) const;
43+
44+
void Print() const;
45+
size_t Size() const { return fRefVec.size(); }
46+
47+
// For RNTuple serialization
48+
void Clear() { fRefVec.clear(); fLastId = -1; fLastName.clear(); }
49+
void AddRef(const std::string& ref) { fRefVec.push_back(ref); }
50+
const std::vector<std::string>& GetRefs() const { return fRefVec; }
51+
void SetRefs(const std::vector<std::string>& refs) {
52+
fRefVec = refs;
53+
fLastId = -1;
54+
fLastName.clear();
55+
}
56+
};
57+
58+
// RAMNTupleIndex: Genomic position index for fast region queries
59+
class RAMNTupleIndex {
60+
public:
61+
struct IndexEntry {
62+
int32_t refid;
63+
int32_t pos;
64+
int64_t entry;
65+
};
66+
67+
private:
68+
std::vector<IndexEntry> fIndex;
69+
mutable std::map<std::pair<int32_t, int32_t>, int64_t> fIndexMap;
70+
71+
void RebuildMap() const;
72+
73+
public:
74+
RAMNTupleIndex() = default;
75+
~RAMNTupleIndex() = default;
76+
77+
void AddItem(int32_t refid, int32_t pos, int64_t row);
78+
int64_t GetRow(int32_t refid, int32_t pos) const;
79+
std::vector<int64_t> GetRowsInRange(int32_t refid, int32_t start, int32_t end) const;
80+
81+
void Print() const;
82+
size_t Size() const { return fIndex.size(); }
83+
84+
// For RNTuple serialization
85+
const std::vector<IndexEntry>& GetEntries() const { return fIndex; }
86+
void SetEntries(const std::vector<IndexEntry>& entries) {
87+
fIndex = entries;
88+
fIndexMap.clear();
89+
}
90+
void Clear() { fIndex.clear(); fIndexMap.clear(); }
91+
};
92+
// RAMNTupleRecord: Main record structure for genomic alignments
93+
class RAMNTupleRecord {
94+
public:
95+
// Quality compression options
96+
enum EQualCompressionBits {
97+
kPhred33 = 1 << 14, // Default Phred+33 quality score
98+
kIlluminaBinning = 1 << 15, // Illumina 8 bin compression
99+
kDrop = 1 << 16 // Drop quality score
100+
};
101+
102+
// Alignment data fields
103+
std::string qname; // Query template NAME
104+
uint16_t flag; // Bitwise FLAG
105+
int32_t refid; // Reference sequence ID
106+
int32_t pos; // 0-based left most mapping POSition
107+
uint8_t mapq; // MAPing Quality
108+
std::vector<uint32_t> cigar; // CIGAR operations
109+
int32_t refnext; // Reference ID of the mate/next read
110+
int32_t pnext; // 0-based position of the mate/next read
111+
int32_t tlen; // Observed Template LENgth
112+
std::string seq; // Segment sequence (encoded)
113+
std::string qual; // Quality scores (encoded)
114+
std::vector<std::string> tags; // Optional SAM tags
115+
116+
uint32_t compression_flags;
117+
118+
// Static reference and index managers
119+
static std::unique_ptr<RAMNTupleRefs> fgRnameRefs;
120+
static std::unique_ptr<RAMNTupleRefs> fgRnextRefs;
121+
static std::unique_ptr<RAMNTupleIndex> fgIndex;
122+
123+
public:
124+
RAMNTupleRecord();
125+
~RAMNTupleRecord() = default;
126+
127+
// Setters (SAM format, 1-based positions)
128+
void SetQNAME(const std::string& qname_) { qname = qname_; }
129+
void SetFLAG(uint16_t f) { flag = f; }
130+
void SetRNAME(const std::string& rname);
131+
void SetREFID(const std::string& rname) { SetRNAME(rname); }
132+
void SetPOS(int32_t pos_) { pos = pos_ - 1; } // SAM is 1-based, we are 0-based
133+
void SetMAPQ(uint8_t mapq_) { mapq = mapq_; }
134+
void SetCIGAR(const std::string& cigar_str);
135+
void SetRNEXT(const std::string& rnext);
136+
void SetREFNEXT(const std::string& rnext) { SetRNEXT(rnext); }
137+
void SetPNEXT(int32_t pnext_) { pnext = pnext_ - 1; } // SAM is 1-based, we are 0-based
138+
void SetTLEN(int32_t tlen_) { tlen = tlen_; }
139+
void SetSEQ(const std::string& seq_str);
140+
void SetQUAL(const std::string& qual_str);
141+
void AddTag(const std::string& tag) { tags.push_back(tag); }
142+
void SetOPT(const std::string& tag) { AddTag(tag); }
143+
void ClearTags() { tags.clear(); }
144+
void ResetNOPT() { ClearTags(); }
145+
146+
// Getters (SAM format, 1-based positions)
147+
const std::string& GetQNAME() const { return qname; }
148+
uint16_t GetFLAG() const { return flag; }
149+
std::string GetRNAME() const;
150+
int32_t GetREFID() const { return refid; }
151+
int32_t GetPOS() const { return pos + 1; } // Convert back to 1-based for SAM
152+
uint8_t GetMAPQ() const { return mapq; }
153+
std::string GetCIGAR() const;
154+
std::string GetRNEXT() const;
155+
int32_t GetREFNEXT() const { return refnext; }
156+
int32_t GetPNEXT() const { return pnext + 1; } // Convert back to 1-based for SAM
157+
int32_t GetTLEN() const { return tlen; }
158+
std::string GetSEQ() const;
159+
std::string GetQUAL() const;
160+
const std::vector<std::string>& GetTags() const { return tags; }
161+
int GetNOPT() const { return static_cast<int>(tags.size()); }
162+
const std::string& GetOPT(int idx) const { return tags[idx]; }
163+
164+
// Return sequence length without decoding (fast)
165+
int GetSEQLEN() const {
166+
if (seq.size() < 4) return 0;
167+
return *reinterpret_cast<const uint32_t*>(seq.data());
168+
}
169+
170+
size_t GetNCIGAROP() const { return cigar.size(); }
171+
int32_t GetCIGAROPLEN(size_t idx) const;
172+
int32_t GetCIGAROP(size_t idx) const;
173+
174+
void Print(const char* option = "") const;
175+
bool IsValid() const;
176+
void SetBit(uint32_t bit) { compression_flags |= bit; }
177+
bool TestBit(uint32_t bit) const { return compression_flags & bit; }
178+
179+
// Static managers
180+
static void InitializeRefs();
181+
static RAMNTupleRefs* GetRnameRefs() { return fgRnameRefs.get(); }
182+
static RAMNTupleRefs* GetRnextRefs() { return fgRnextRefs.get(); }
183+
static RAMNTupleIndex* GetIndex() { return fgIndex.get(); }
184+
185+
// File I/O
186+
static std::unique_ptr<ROOT::Experimental::RNTupleReader> OpenRAMFile(
187+
const std::string& filename, const std::string& ntupleName = "RAM");
188+
static void WriteAllRefs(TFile& file);
189+
static void ReadAllRefs(const std::string& filename = "");
190+
static void WriteIndex(TFile& file);
191+
static void ReadIndex(const std::string& filename = "");
192+
193+
// RNTuple model creation
194+
static std::unique_ptr<ROOT::Experimental::RNTupleModel> MakeModel();
195+
196+
void SetCompressionMode(uint32_t flags) { compression_flags = flags; }
197+
198+
private:
199+
static void WriteRefs(TFile& file, const RAMNTupleRefs* refs, const std::string& refname);
200+
static void ReadRefs(const std::string& filename, std::unique_ptr<RAMNTupleRefs>& refs, const std::string& refname);
201+
static void WriteRnameRefs(TFile& file) { WriteRefs(file, fgRnameRefs.get(), "RnameRefs"); }
202+
static void WriteRnextRefs(TFile& file) { WriteRefs(file, fgRnextRefs.get(), "RnextRefs"); }
203+
static void ReadRnameRefs(const std::string& filename = "") { ReadRefs(filename, fgRnameRefs, "RnameRefs"); }
204+
static void ReadRnextRefs(const std::string& filename = "") { ReadRefs(filename, fgRnextRefs, "RnextRefs"); }
205+
};
206+
// CIGAR operation codes (from BAM format)
207+
208+
constexpr uint8_t RAM_CIGAR_M = 0;
209+
constexpr uint8_t RAM_CIGAR_I = 1;
210+
constexpr uint8_t RAM_CIGAR_D = 2;
211+
constexpr uint8_t RAM_CIGAR_N = 3;
212+
constexpr uint8_t RAM_CIGAR_S = 4;
213+
constexpr uint8_t RAM_CIGAR_H = 5;
214+
constexpr uint8_t RAM_CIGAR_P = 6;
215+
constexpr uint8_t RAM_CIGAR_EQUAL = 7;
216+
constexpr uint8_t RAM_CIGAR_X = 8;
217+
218+
// Sequence and Quality utilities
219+
namespace RAMNTupleUtils {
220+
std::string EncodeSequence(const std::string& seq);
221+
std::string DecodeSequence(const std::string& encoded_seq, size_t length);
222+
223+
std::string EncodeQuality(const std::string& qual, uint32_t compression_flags);
224+
std::string DecodeQuality(const std::string& encoded_qual, uint32_t compression_flags);
225+
226+
std::vector<uint32_t> ParseCIGAR(const std::string& cigar_str);
227+
std::string FormatCIGAR(const std::vector<uint32_t>& cigar_ops);
228+
229+
extern const uint8_t kIlluminaBinning[256];
230+
}
231+
// SAM <-> RNTuple conversion utilities
232+
class RAMNTupleConverter {
233+
public:
234+
static void ConvertSAMToRAMNTuple(const std::string& sam_file,
235+
const std::string& ram_file,
236+
uint32_t compression_flags = RAMNTupleRecord::kPhred33);
237+
238+
static void ConvertRAMNTupleToSAM(const std::string& ram_file,
239+
const std::string& sam_file);
240+
241+
static void BuildIndex(const std::string& ram_file);
242+
243+
static void ViewRegion(const std::string& ram_file,
244+
const std::string& region);
245+
};
246+
247+
#endif
248+

0 commit comments

Comments
 (0)