Skip to content

Commit acbebf5

Browse files
committed
[ntuple] add streaming vector tutorial
1 parent 56575c2 commit acbebf5

File tree

1 file changed

+197
-0
lines changed

1 file changed

+197
-0
lines changed
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
/// \file
2+
/// \ingroup tutorial_ntuple
3+
///
4+
/// Example of a streaming vector: a special purpose container that reads large vectors piece-wise.
5+
///
6+
/// \macro_code
7+
///
8+
/// \date November 2024
9+
/// \author Peter van Gemmeren, the ROOT Team
10+
11+
#include <ROOT/RNTupleModel.hxx>
#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleReadOptions.hxx>
#include <ROOT/RNTupleRange.hxx>
#include <ROOT/RNTupleTypes.hxx>
#include <ROOT/RNTupleView.hxx>
#include <ROOT/RNTupleWriter.hxx>

#include <TRandom3.h>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <utility>
#include <vector>
25+
26+
// Name of the ROOT file that the tutorial first writes and then reads back
constexpr const char *kFileName = "ntpl015_streaming_vector.root";
// Name under which the RNTuple is stored inside that file
constexpr const char *kNTupleName = "ntpl";
// Name of the single (vector) field of the RNTuple
constexpr const char *kFieldName = "LargeVector";
// Number of entries written to the RNTuple
constexpr unsigned int kNEvents = 10;
// Number of elements of the vector stored in every entry
constexpr unsigned int kVectorSize = 1'000'000;
31+
32+
// Create an RNTuple with a single vector field. Every entry contains a large vector of random integers.
33+
// The vector should be seen as too large to be held entirely in memory during reading.
34+
void CreateRNTuple()
35+
{
36+
auto model = ROOT::RNTupleModel::Create();
37+
auto ptrLargeVector = model->MakeField<std::vector<std::uint32_t>>(kFieldName);
38+
auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), kNTupleName, kFileName);
39+
40+
auto prng = std::make_unique<TRandom3>();
41+
prng->SetSeed();
42+
43+
for (ROOT::NTupleSize_t i = 0; i < kNEvents; i++) {
44+
ptrLargeVector->clear();
45+
for (std::size_t j = 0; j < kVectorSize; j++)
46+
ptrLargeVector->emplace_back(prng->Integer(-1));
47+
writer->Fill();
48+
}
49+
std::cout << "RNTuple written" << std::endl;
50+
}
51+
52+
/*
53+
* ==================================================================================================
54+
*/
55+
56+
// For comparison, the canonical read function that reads the entire vector for every entry.
57+
void ReadRNTupleSimple()
58+
{
59+
auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName);
60+
61+
const auto nEntries = reader->GetNEntries();
62+
std::cout << "Simple reading, found " << nEntries << " entries" << std::endl;
63+
64+
auto ptrLargeVector = reader->GetModel().GetDefaultEntry().GetPtr<std::vector<std::uint32_t>>(kFieldName);
65+
for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) {
66+
reader->LoadEntry(i);
67+
68+
const auto vectorSize = ptrLargeVector->size();
69+
uint64_t sum = 0;
70+
for (auto val : *ptrLargeVector)
71+
sum += val;
72+
73+
std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl;
74+
}
75+
std::cout << "RNTuple simple read" << std::endl;
76+
}
77+
78+
/*
79+
* ==================================================================================================
80+
*/
81+
82+
// The StreamingVector class allows iteration over an RNTuple on-disk vector of element type T.
83+
// Unlike an std::vector, this class does not provide random-access but only allows to iterate the data elements
84+
// from beginning to end.
85+
// Internally, it uses an RNTupleCollection view and an item view to load chunks of the vector elements into memory,
86+
// so that never the entire vector needs to stay in memory.
87+
// Note that we don't need to implement loading chunks of data explicitly. Simply by asking for a single vector element
88+
// at every iteration step, the RNTuple views will take care of keeping only the currently required data pages
89+
// in memory.
90+
template <class T>
91+
class StreamingVector {
92+
// For a certain entry, the collection view provides the information about the size of the collection and
93+
// the index range of the item view, which is required to read the values of the collection at hand.
94+
ROOT::RNTupleCollectionView fVectorView;
95+
// The "data view" provides access to the vector elements
96+
ROOT::RNTupleView<T> fItemView;
97+
// Given an entry number, the start end end index in the item view to read the corresponding vector elements
98+
ROOT::RNTupleLocalRange fRange{ROOT::kInvalidDescriptorId, ROOT::kInvalidNTupleIndex, ROOT::kInvalidNTupleIndex};
99+
// The index of the entry from which the vector should be read
100+
ROOT::NTupleSize_t fEntry{0};
101+
// The size of the collection in fEntry
102+
ROOT::NTupleSize_t fSize{0};
103+
104+
public:
105+
// A lightweight iterator used in StreamingVector::begin() and StreamingVector::end(). Used to iterate over the
106+
// elements of an RNTuple on-disk vector for a certain entry. Dereferencing the iterator returns the corresponding
107+
// value of the item view.
108+
class iterator {
109+
ROOT::RNTupleLocalRange::RIterator fRangeItr;
110+
ROOT::RNTupleView<T> &fView;
111+
112+
public:
113+
iterator(ROOT::RNTupleLocalRange::RIterator rangeItr, ROOT::RNTupleView<T> &view)
114+
: fRangeItr(rangeItr), fView(view)
115+
{
116+
}
117+
118+
iterator operator++(int) /* postfix */
119+
{
120+
auto r = *this;
121+
++(*this);
122+
return r;
123+
}
124+
iterator &operator++() /* prefix */
125+
{
126+
++fRangeItr;
127+
return *this;
128+
}
129+
const T &operator*() { return fView.operator()(*fRangeItr); }
130+
const T *operator->() { return &fView.operator()(*fRangeItr); }
131+
bool operator==(const iterator &rh) const { return fRangeItr == rh.fRangeItr; }
132+
bool operator!=(const iterator &rh) const { return fRangeItr != rh.fRangeItr; }
133+
};
134+
135+
explicit StreamingVector(ROOT::RNTupleCollectionView vectorView)
136+
: fVectorView(std::move(vectorView)), fItemView(fVectorView.GetView<T>("_0"))
137+
{
138+
}
139+
140+
//StreamingVector(ROOT::RNTupleReader &reader, std::string_view fieldName)
141+
// : fVectorView(reader.GetCollectionView(fieldName)), fItemView(fVectorView.GetView<T>("_0")),
142+
// fRange(ROOT::kInvalidDescriptorId, ROOT::kInvalidNTupleIndex, ROOT::kInvalidNTupleIndex)
143+
//{
144+
//}
145+
146+
ROOT::NTupleSize_t size() const { return fSize; }
147+
148+
// The begin() and end() methods enable range-based for loops like `for (auto val : streamingVector)`
149+
iterator begin() { return iterator(fRange.begin(), fItemView); }
150+
iterator end() { return iterator(fRange.end(), fItemView); }
151+
152+
void LoadEntry(ROOT::NTupleSize_t entry)
153+
{
154+
fEntry = entry;
155+
fRange = fVectorView.GetCollectionRange(fEntry);
156+
fSize = fVectorView.operator()(fEntry);
157+
}
158+
};
159+
160+
// For the streaming vector read, we use a custom class `StreamingVector` that implements the piece-wise
161+
// loading of the data during iteration of elements of the on-disk vector. The class has been built such that
162+
// the event loop is almost identical to the simple reading case above.
163+
void ReadRNTupleStreamingVector()
164+
{
165+
ROOT::RNTupleReadOptions options;
166+
// Don't preload data; we want to populate data into memory only as needed
167+
options.SetClusterCache(ROOT::RNTupleReadOptions::EClusterCache::kOff);
168+
auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName, options);
169+
170+
const auto nEntries = reader->GetNEntries();
171+
std::cout << "Streamed reading, found " << nEntries << " entries" << std::endl;
172+
173+
StreamingVector<std::uint32_t> streamingVector(reader->GetCollectionView(kFieldName));
174+
175+
for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) {
176+
// Instead of `reader->LoadEntry()`, we tell the streaming vector which entry we want to read.
177+
streamingVector.LoadEntry(i);
178+
179+
// We can ask for the size of the vector without loading the data
180+
const auto vectorSize = streamingVector.size();
181+
182+
// The iteration works exactly as in the simple case
183+
uint64_t sum = 0;
184+
for (auto val : streamingVector)
185+
sum += val;
186+
187+
std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl;
188+
}
189+
std::cout << "RNTuple streaming read" << std::endl;
190+
}
191+
192+
void ntpl016_streaming_vector()
193+
{
194+
CreateRNTuple();
195+
ReadRNTupleSimple();
196+
ReadRNTupleStreamingVector();
197+
}

0 commit comments

Comments
 (0)